{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "1QiCFLer1FIe" }, "source": [ "**Lab 14 – Gradient boosting and ensemble learning**" ] }, { "cell_type": "markdown", "metadata": { "id": "vCyq3-8y1FIj" }, "source": [ "_This notebook contains the sample from https://github.com/ageron/handson-ml3/blob/main/07_ensemble_learning_and_random_forests.ipynb, https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn and https://github.com/catboost/tutorials" ] }, { "cell_type": "markdown", "metadata": { "id": "9J5g6PDs1FIk" }, "source": [ "\n", " \n", " \n", "
\n", " \"Open\n", " \n", " \n", "
" ] }, { "cell_type": "code", "source": [ "!pip install scikeras[tensorflow] -qq\n", "!pip install xgboost -U -qq\n", "!pip install lightgbm -U -qq\n", "!pip install catboost -U\n", "!pip install shap" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Dpt39LbKyuet", "outputId": "d814ae5f-37c3-46a7-b68d-d9148142fd87" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: catboost in /usr/local/lib/python3.7/dist-packages (1.0.6)\n", "Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.7/dist-packages (from catboost) (1.21.6)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from catboost) (1.15.0)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from catboost) (1.4.1)\n", "Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.7/dist-packages (from catboost) (1.3.5)\n", "Requirement already satisfied: graphviz in /usr/local/lib/python3.7/dist-packages (from catboost) (0.10.1)\n", "Requirement already satisfied: plotly in /usr/local/lib/python3.7/dist-packages (from catboost) (5.5.0)\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from catboost) (3.2.2)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.24.0->catboost) (2.8.2)\n", "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.24.0->catboost) (2022.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->catboost) (1.4.2)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->catboost) (0.11.0)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->catboost) 
(3.0.9)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->catboost) (4.2.0)\n", "Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from plotly->catboost) (8.0.1)\n", "Collecting shap\n", " Downloading shap-0.40.0-cp37-cp37m-manylinux2010_x86_64.whl (564 kB)\n", "\u001b[K |████████████████████████████████| 564 kB 5.0 MB/s \n", "\u001b[?25hRequirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from shap) (1.4.1)\n", "Requirement already satisfied: tqdm>4.25.0 in /usr/local/lib/python3.7/dist-packages (from shap) (4.64.0)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from shap) (1.3.5)\n", "Requirement already satisfied: numba in /usr/local/lib/python3.7/dist-packages (from shap) (0.51.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from shap) (1.21.6)\n", "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.7/dist-packages (from shap) (1.3.0)\n", "Collecting slicer==0.0.7\n", " Downloading slicer-0.0.7-py3-none-any.whl (14 kB)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from shap) (1.0.2)\n", "Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.7/dist-packages (from shap) (21.3)\n", "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>20.9->shap) (3.0.9)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from numba->shap) (57.4.0)\n", "Requirement already satisfied: llvmlite<0.35,>=0.34.0.dev0 in /usr/local/lib/python3.7/dist-packages (from numba->shap) (0.34.0)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->shap) (2.8.2)\n", "Requirement already satisfied: pytz>=2017.3 in 
/usr/local/lib/python3.7/dist-packages (from pandas->shap) (2022.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->shap) (1.15.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->shap) (3.1.0)\n", "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->shap) (1.1.0)\n", "Installing collected packages: slicer, shap\n", "Successfully installed shap-0.40.0 slicer-0.0.7\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import pandas as pd\n", "import time\n", "\n", "import tensorflow as tf\n", "#tf.compat.v1.disable_v2_behavior() #Should be enabled when using DeepSHAP, see https://github.com/slundberg/shap/issues/2189\n", "from tensorflow import keras\n", "from scikeras.wrappers import KerasClassifier, KerasRegressor\n", "\n", "from sklearn.datasets import make_moons\n", "from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n", "from sklearn.ensemble import StackingClassifier\n", "from sklearn.ensemble import BaggingClassifier, GradientBoostingRegressor, GradientBoostingClassifier\n", "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold\n", "from sklearn.svm import SVC\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.ensemble import AdaBoostClassifier\n", "from sklearn import datasets\n", "from sklearn.metrics import mean_absolute_error\n", "from sklearn.preprocessing import LabelEncoder\n", "\n", "from xgboost import XGBClassifier\n", "from xgboost import XGBRegressor\n", "\n", "import lightgbm as lgb\n", "\n", "from catboost import 
CatBoostClassifier, CatBoostRegressor, Pool\n", "\n", "import shap\n", "shap.initjs() \n", "\n", "import matplotlib as mpl\n", "from matplotlib import pyplot as plt\n", "from matplotlib import cm\n", "%matplotlib inline" ], "metadata": { "id": "5bV_HvPiH-9i", "colab": { "base_uri": "https://localhost:8080/", "height": 43 }, "outputId": "71539579-11a7-4ce4-d6d2-ca247872f37c" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "def plot_dataset(X, y):\n", " plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"bs\")\n", " plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"g^\")\n", " #plt.axis(axes)\n", " plt.grid(True, which='both')\n", " plt.xlabel(\"$x_1$\")\n", " plt.ylabel(\"$x_2$\", rotation=0)\n", "\n", "def plot_decision_boundary(clf, X, y, alpha=1.0):\n", " axes=[-1.5, 2.4, -1, 1.5]\n", " x1, x2 = np.meshgrid(np.linspace(axes[0], axes[1], 100),\n", " np.linspace(axes[2], axes[3], 100))\n", " X_new = np.c_[x1.ravel(), x2.ravel()]\n", " y_pred = clf.predict(X_new).reshape(x1.shape)\n", " \n", " plt.contourf(x1, x2, y_pred, alpha=0.3 * alpha, cmap='Wistia')\n", " plt.contour(x1, x2, y_pred, cmap=\"Greys\", alpha=0.8 * alpha)\n", " colors = [\"#78785c\", \"#c47b27\"]\n", " markers = (\"o\", \"^\")\n", " for idx in (0, 1):\n", " plt.plot(X[:, 0][y == idx], X[:, 1][y == idx],\n", " color=colors[idx], marker=markers[idx], linestyle=\"none\")\n", " plt.axis(axes)\n", " plt.xlabel(r\"$x_1$\")\n", " plt.ylabel(r\"$x_2$\", rotation=0)" ], "metadata": { "id": "ruWWKAR_Mz5P" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Voting classifier" ], "metadata": { "id": "LTrx75jqLcLg" } }, { "cell_type": "markdown", "source": [ "`Scikit-Learn` provides a `VotingClassifier` class that’s quite easy to use: just give it a list of name/predictor pairs, and use it like a normal classifier, that’s it! Let’s try it on the moons dataset (this is a toy dataset for binary classification in which the data points are shaped as two interleaving crescent moons). 
We will load the moons dataset and split it into a training set and a test set, then we’ll create and train a voting classifier composed of four diverse classifiers (logistic regression, a random forest, an SVM, and a small Keras neural network):
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "Here we use [`SciKeras`](https://github.com/adriangb/scikeras) to wrap kerase model into `Scikit-Learn`." ], "metadata": { "id": "qmJ0_x9ezBAO" } }, { "cell_type": "code", "source": [ "def get_model():\n", " model= keras.models.Sequential([keras.layers.Dense(30,activation='relu',input_shape=[2]),\n", " keras.layers.Dense(20,activation='relu'),\n", " keras.layers.Dense(1,activation='sigmoid')\n", " ])\n", " model.compile(optimizer='NAdam',loss='binary_crossentropy',metrics=['accuracy'])\n", " return model" ], "metadata": { "id": "DSARLteqwsY-" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "dense_model = KerasClassifier(model=get_model, epochs=200, verbose=False)" ], "metadata": { "id": "mufRkMoGxOuu" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "voting_clf = VotingClassifier(\n", " estimators=[\n", " ('lr', LogisticRegression(random_state=42)),\n", " ('rf', RandomForestClassifier(random_state=42)),\n", " ('svc', SVC(random_state=42)),\n", " ('dense', dense_model)\n", " ]\n", ")\n", "voting_clf.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "z2-c2OA5M4aG", "outputId": "133a4343-5ec0-45d7-e7f3-7b8330d9a83d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),\n", " ('rf', RandomForestClassifier(random_state=42)),\n", " ('svc', SVC(random_state=42)),\n", " ('dense',\n", " KerasClassifier(epochs=200, model=, verbose=False))])" ] }, "metadata": {}, "execution_count": 109 } ] }, { "cell_type": "markdown", "source": [ "When you fit a `VotingClassifier`, it clones every estimator and fits the clones. 
The original estimators are available via the `estimators` attribute, while the fitted clones are available via the `estimators_` attribute. If you prefer a dict rather than a list, you can use `named_estimators` or `named_estimators_` instead. For example, let’s look at each fitted classifier’s accuracy on the test set:" ], "metadata": { "id": "HMfYwrvVNeH3" } }, { "cell_type": "code", "source": [ "for name, clf in voting_clf.named_estimators_.items():\n", " print(name, \"=\", clf.score(X_test, y_test))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1Y1o7PFQNIPg", "outputId": "7853f87b-55f2-4ab1-8482-33fbef160914" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "lr = 0.864\n", "rf = 0.896\n", "svc = 0.896\n", "dense = 0.896\n" ] } ] }, { "cell_type": "markdown", "source": [ "When you call the voting classifier’s `predict()` method, it performs hard voting. For example, the voting classifier predicts class 1 for the first instance of the test set, because 3 out of 4 classifiers predict that class:" ], "metadata": { "id": "pcfc8mtjNywf" } }, { "cell_type": "code", "source": [ "voting_clf.predict(X_test[:1]), [clf.predict(X_test[:1]) for clf in voting_clf.estimators_]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2TrG5WLoNvGl", "outputId": "26dedde2-204a-469f-d971-b87ad8dc555d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(array([1]), [array([1]), array([1]), array([0]), array([1])])" ] }, "metadata": {}, "execution_count": 111 } ] }, { "cell_type": "markdown", "source": [ "Now let’s look at the performance of the voting classifier on the test set:" ], "metadata": { "id": "FfwmQiZiN-7P" } }, { "cell_type": "code", "source": [ "voting_clf.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "R-4gZYXnN3Q0", "outputId": "c64c50e5-7c31-4e8b-d8c3-6ca1e6fb6021" }, 
"execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.88" ] }, "metadata": {}, "execution_count": 112 } ] }, { "cell_type": "code", "source": [ "plot_decision_boundary(voting_clf, X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 285 }, "id": "vQznlFPfU6aY", "outputId": "0fe2bafe-b182-4970-83fc-4ed8decdaf39" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "If all classifiers are able to estimate class probabilities (i.e., they all have a `predict_proba()` method), then you can tell `Scikit-Learn` to predict the class with the **highest class probability, averaged over all the individual classifiers.** This is called **soft voting**. It often achieves higher performance than hard voting because it gives more weight to highly confident votes. All you need to do is set the voting classifier’s voting hyperparameter to \"soft\", and ensure that all classifiers can estimate class probabilities. \n", "\n", "This is not the case for the SVC class by default, so you need to set its probability hyperparameter to True (this will make the SVC class use cross-validation to estimate class probabilities, slowing down training, and it will add a `predict_proba()` method). Let’s try that:" ], "metadata": { "id": "0OCmMXDAOEsI" } }, { "cell_type": "code", "source": [ "voting_clf.voting = \"soft\"\n", "voting_clf.named_estimators[\"svc\"].probability = True\n", "voting_clf.fit(X_train, y_train)\n", "voting_clf.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tex4mcrIOBQ2", "outputId": "5dde7cd6-8849-41c6-e165-5a21e7e2aa00" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.912" ] }, "metadata": {}, "execution_count": 114 } ] }, { "cell_type": "markdown", "source": [ "We reach 91.2% accuracy simply by using soft voting, not bad!" ], "metadata": { "id": "I__SqQ0lOdco" } }, { "cell_type": "markdown", "source": [ "For stacking neural ntwork model, you can refer to https://ensemble-pytorch.readthedocs.io/en/latest/ for more details." 
], "metadata": { "id": "R7wMvsUO8t9V" } }, { "cell_type": "markdown", "source": [ "## Stacking" ], "metadata": { "id": "PBHcV002qZ1u" } }, { "cell_type": "markdown", "source": [ "`Scikit-Learn` provides two classes for stacking ensembles: `StackingClassifier` and `StackingRegressor`. For example, you can replace the `VotingClassifier` you used on the moons dataset with a `StackingClassifier`:" ], "metadata": { "id": "DP5DoUCdqbaN" } }, { "cell_type": "code", "source": [ "stacking_clf = StackingClassifier(\n", " estimators=[\n", " ('lr', LogisticRegression(random_state=42)),\n", " ('rf', RandomForestClassifier(random_state=42)),\n", " ('svc', SVC(probability=True, random_state=42))\n", " ],\n", " final_estimator=RandomForestClassifier(random_state=43),\n", " cv=5 # number of cross-validation folds\n", ")\n", "stacking_clf.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1rqpE6FZqlN1", "outputId": "6de014a5-d7c7-4bac-d60b-6c42c6644494" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "StackingClassifier(cv=5,\n", " estimators=[('lr', LogisticRegression(random_state=42)),\n", " ('rf', RandomForestClassifier(random_state=42)),\n", " ('svc', SVC(probability=True, random_state=42))],\n", " final_estimator=RandomForestClassifier(random_state=43))" ] }, "metadata": {}, "execution_count": 34 } ] }, { "cell_type": "markdown", "source": [ "For each predictor, the stacking classifier will call `predict_proba()` if available, or it will fallback to `decision_function()` if available, or as a last resort it will call `predict()`. If you don’t provide a final estimator, `StackingClassifier` will use `LogisticRegression`, and `StackingRegressor` will use `RidgeCV`." 
], "metadata": { "id": "08TA3D47qsWN" } }, { "cell_type": "code", "source": [ "stacking_clf.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7ms6ZeGCq4_W", "outputId": "6147afb3-260d-491c-d36c-dbe82cd0c620" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.928" ] }, "metadata": {}, "execution_count": 35 } ] }, { "cell_type": "markdown", "source": [ "You get 92.8% accuracy! which is a bit better than the voting classifier using soft voting, which got 92%." ], "metadata": { "id": "pIeOuOsOq_N-" } }, { "cell_type": "markdown", "source": [ "## Baaging and Pasting" ], "metadata": { "id": "I97bOx8CTlf3" } }, { "cell_type": "markdown", "source": [ "`Scikit-Learn` offers a simple API for both bagging and pasting with the `BaggingClassifier` class (or `BaggingRegressor` for regression). The following code trains an ensemble of 500 **Decision Tree classifiers**:⁠ each is trained on 100 training instances randomly sampled from the training set with replacement (this is an example of bagging, but if you want to use pasting instead, just set `bootstrap=False`). The `n_jobs` parameter tells `Scikit-Learn` the number of CPU cores to use for training and predictions, and –1 tells `Scikit-Learn` to use all available cores." 
], "metadata": { "id": "QzERUKToTov_" } }, { "cell_type": "code", "source": [ "bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,\n", " max_samples=100, random_state=42)\n", "bag_clf.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "x43B6MDAObRX", "outputId": "5e4c6379-1a36-497d-a99f-f949e3e1c8fa" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,\n", " n_estimators=500, random_state=42)" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "markdown", "source": [ "Notice that the `BaggingClassifier` automatically performs **soft voting** instead of hard voting if the base classifier can estimate class probabilities (i.e., if it has a `predict_proba()` method), which is the case with Decision Tree classifiers." ], "metadata": { "id": "BF7-HM3KT99Q" } }, { "cell_type": "markdown", "source": [ "We compares the decision boundary of a single Decision Tree with the decision boundary of a bagging ensemble of 500 trees (from the preceding code), both trained on the moons dataset. As you can see, the ensemble’s predictions will likely generalize much better than the single Decision Tree’s predictions: the ensemble has a comparable bias but a smaller variance (it makes roughly the same number of errors on the training set, but the decision boundary is less irregular)." 
], "metadata": { "id": "4lhj-r7nUMn3" } }, { "cell_type": "code", "source": [ "tree_clf = DecisionTreeClassifier(random_state=42)\n", "tree_clf.fit(X_train, y_train)\n", "\n", "fig, axes = plt.subplots(ncols=2, figsize=(10, 4), sharey=True)\n", "plt.sca(axes[0])\n", "plot_decision_boundary(tree_clf, X_train, y_train)\n", "plt.title(\"Decision Tree\")\n", "plt.sca(axes[1])\n", "plot_decision_boundary(bag_clf, X_train, y_train)\n", "plt.title(\"Decision Trees with Bagging\")\n", "plt.ylabel(\"\")\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 297 }, "id": "8wOuoYCmUMG2", "outputId": "cb3cf50f-611e-477e-cf95-6c10cd6c1df5" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "### Out-of-Bag evaluation" ], "metadata": { "id": "9HiUsZa_VGBw" } }, { "cell_type": "markdown", "source": [ "It can be shown mathematically that only about 63% of the training instances are sampled on average for each predictor.⁠6 The remaining 37% of the training instances that are not sampled are called out-of-bag (oob) instances. Note that they are not the same 37% for all predictors. The following calculate this number when `m=1000`:" ], "metadata": { "id": "LXRxTOnAa2Ev" } }, { "cell_type": "code", "source": [ "print(1 - (1 - 1 / 1000) ** 1000)\n", "print(1 - np.exp(-1))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "t_pOXuBaVRwI", "outputId": "1104123d-7f8b-4160-f868-f6b17a2ee36c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.6323045752290363\n", "0.6321205588285577\n" ] } ] }, { "cell_type": "markdown", "source": [ "In `Scikit-Learn`, you can set `oob_score=True` when creating a `BaggingClassifier` to request an automatic oob evaluation after training. The following code demonstrates this. 
The resulting evaluation score is available in the `oob_score_` attribute:" ], "metadata": { "id": "aMg1xzaMVHfr" } }, { "cell_type": "code", "source": [ "bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,\n", " oob_score=True, n_jobs=-1, random_state=42)\n", "bag_clf.fit(X_train, y_train)\n", "bag_clf.oob_score_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MpOmKptvT65-", "outputId": "418d2dc7-8f3e-4379-cf07-be3a423772cb" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.896" ] }, "metadata": {}, "execution_count": 22 } ] }, { "cell_type": "markdown", "source": [ "According to this oob evaluation, this `BaggingClassifier` is likely to achieve about 89.6% accuracy on the test set. Let’s verify this:" ], "metadata": { "id": "qjCSrZaaakSn" } }, { "cell_type": "code", "source": [ "y_pred = bag_clf.predict(X_test)\n", "accuracy_score(y_test, y_pred)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "C5GIYPd1an13", "outputId": "65ebf350-b3f7-4885-cd6b-3ef10aa70a76" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.92" ] }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "markdown", "source": [ "We get 92% accuracy on the test. The oob evaluation was a bit too pessimistic, a bit over 2% too low." ], "metadata": { "id": "OmiahqHWaqqo" } }, { "cell_type": "markdown", "source": [ "The `BaggingClassifier` class supports **sampling the features** as well. Sampling is controlled by two hyperparameters: `max_features` and `bootstrap_features`. They work the same way as `max_samples` and `bootstrap`, but for feature sampling instead of instance sampling. Thus, each predictor will be trained on a random subset of the input features.\n", "\n", "This technique is particularly useful when you are dealing with high-dimensional inputs (such as images). 
**Sampling both training instances and features is called the Random Patches method**.⁠ Keeping all training instances (by setting `bootstrap=False` and `max_samples=1.0`) but sampling features (by setting `bootstrap_features` to True and/or `max_features` to a value smaller than 1.0) is called the **Random Subspaces method**.⁠" ], "metadata": { "id": "rva-jGH6bOmw" } }, { "cell_type": "markdown", "source": [ "### Random forest" ], "metadata": { "id": "9oWyFDYpdIwf" } }, { "cell_type": "markdown", "source": [ "Random Forest9 is an ensemble of Decision Trees, generally trained via the bagging method (or sometimes pasting), **typically with `max_samples` set to the size of the training set.** Instead of building a `BaggingClassifier` and passing it a `DecisionTreeClassifier`, you can use the `RandomForestClassifier` class, which is more convenient and **optimized for Decision Trees** (similarly, there is a `RandomForestRegressor` class for regression tasks). The follwong `BaggingClassifier` is equivalent to random forest\n", "\n", "```python\n", "bag_clf = BaggingClassifier(\n", " DecisionTreeClassifier(max_features=\"sqrt\", max_leaf_nodes=16),\n", " n_estimators=500, n_jobs=-1, random_state=42)\n", "```\n", "\n", "The following code trains a Random Forest classifier with 500 trees, each limited to maximum 16 nodes, and using all available CPU cores:" ], "metadata": { "id": "j_TkqrLMdMT4" } }, { "cell_type": "code", "source": [ "rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)\n", "rnd_clf.fit(X_train, y_train)\n", "y_pred_rf = rnd_clf.predict(X_test)" ], "metadata": { "id": "rBo1gB1MVWwI" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "y_pred = rnd_clf.predict(X_test)\n", "accuracy_score(y_test, y_pred)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "X5-UVNb5jk-H", "outputId": "63b8fb73-d95f-47f4-dcb5-fe0150c9e8d9" }, "execution_count": null, "outputs": [ 
{ "output_type": "execute_result", "data": { "text/plain": [ "0.912" ] }, "metadata": {}, "execution_count": 31 } ] }, { "cell_type": "markdown", "source": [ "You can also create an Extra-Trees classifier using `Scikit-Learn`’s `ExtraTreesClassifier` class. Its API is identical to the `RandomForestClassifier` class, except bootstrap defaults to False. Similarly, the `ExtraTreesRegressor` class has the same API as the RandomForestRegressor class, except bootstrap defaults to False." ], "metadata": { "id": "9x22GreAfOSv" } }, { "cell_type": "markdown", "source": [ "## AdaBoost" ], "metadata": { "id": "Ro3Mdc0UjuAe" } }, { "cell_type": "markdown", "source": [ "Scikit-Learn uses a multiclass version of AdaBoost called `SAMME`⁠ (which stands for Stagewise Additive Modeling using a Multiclass Exponential loss function). When there are just two classes, SAMME is equivalent to AdaBoost. If the predictors can estimate class probabilities (i.e., if they have a `predict_proba()` method), Scikit-Learn can use a variant of SAMME called SAMME.R (the R stands for “Real”), which relies on class probabilities rather than predictions and generally performs better." ], "metadata": { "id": "bS8rugGHkMov" } }, { "cell_type": "markdown", "source": [ "The following code trains an AdaBoost classifier based on 30 Decision Stumps using `Scikit-Learn`’s `AdaBoostClassifier` class (as you might expect, there is also an `AdaBoostRegressor` class). A Decision Stump is a Decision Tree with `max_depth=1`—in other words, **a tree composed of a single decision node** plus two leaf nodes. 
This is the default base estimator for the `AdaBoostClassifier` class:" ], "metadata": { "id": "Pmgz2OtNkX-n" } }, { "cell_type": "code", "source": [ "ada_clf = AdaBoostClassifier(\n", " DecisionTreeClassifier(max_depth=1), n_estimators=30,\n", " learning_rate=0.5, random_state=42)\n", "ada_clf.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aDhLJhmhdfw2", "outputId": "7b365b79-8e1d-4ac6-ab4a-8dcd7433cf96" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),\n", " learning_rate=0.5, n_estimators=30, random_state=42)" ] }, "metadata": {}, "execution_count": 115 } ] }, { "cell_type": "code", "source": [ "plot_decision_boundary(ada_clf, X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 285 }, "id": "Gsh_380VkHX_", "outputId": "20d453fc-efc5-4ea5-c391-af74e94502fd" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "## Gradient Boosting" ], "metadata": { "id": "91HFYHBB-Bmv" } }, { "cell_type": "markdown", "source": [ "First, let’s generate a noisy quadratic dataset and fit a `DecisionTreeRegressor`to it:" ], "metadata": { "id": "2uM9bAyb-D1m" } }, { "cell_type": "code", "source": [ "np.random.seed(42)\n", "X = np.random.rand(100, 1) - 0.5\n", "y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100) # y = 3x² + Gaussian noise\n", "\n", "tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)\n", "tree_reg1.fit(X, y)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LgoG2Cg58-oC", "outputId": "a702998b-7ece-466f-a438-3d2ada3b4f24" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "DecisionTreeRegressor(max_depth=2, random_state=42)" ] }, "metadata": {}, "execution_count": 119 } ] }, { "cell_type": "markdown", "source": [ "A simpler way to train GBRT ensembles is to use Scikit-Learn’s `GradientBoostingRegressor` class (there’s also a `GradientBoostingClassifier` class for classification). Much like the `RandomForestRegressor` class, it has hyperparameters to control the growth of Decision Trees (e.g., `max_depth`, `min_samples_leaf`), as well as hyperparameters to control the ensemble training, such as the number of trees (`n_estimator`s). 
" ], "metadata": { "id": "PXBmpTZ6-TUf" } }, { "cell_type": "code", "source": [ "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)\n", "gbrt.fit(X, y)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QUJv2gs2-pmE", "outputId": "016aca12-5341-403d-ede4-03d9ddf357ad" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3,\n", " random_state=42)" ] }, "metadata": {}, "execution_count": 122 } ] }, { "cell_type": "markdown", "source": [ "The `learning_rate` hyperparameter scales the contribution of each tree. If you set it to a low value, such as 0.05, you will need more trees in the ensemble to fit the training set, but the predictions will usually generalize better. This is a regularization technique called shrinkage.\n", "\n", "To find the optimal number of trees, you could perform cross-validation using `GridSearchCV` or `RandomizedSearchCV`, as usual, but there’s a simpler way: if you set the `n_iter_no_change` hyperparameter to an integer value, say 10, then the `GradientBoostingRegressor` will automatically stop adding more trees during training if it sees that the last 10 trees didn’t help. This is simply early stopping, but with a little bit of patience: it tolerates having no progress for a few iterations before it stops. 
Let’s train the ensemble using early stopping:" ], "metadata": { "id": "HONBXa5P--iv" } }, { "cell_type": "code", "source": [ "gbrt_best = GradientBoostingRegressor(\n", " max_depth=2, learning_rate=0.05, n_estimators=500,\n", " n_iter_no_change=10, random_state=42)\n", "gbrt_best.fit(X, y)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hSZK_Cvb-zke", "outputId": "c4b43dca-b453-4619-c547-b0bb44bb4c50" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "GradientBoostingRegressor(learning_rate=0.05, max_depth=2, n_estimators=500,\n", " n_iter_no_change=10, random_state=42)" ] }, "metadata": {}, "execution_count": 123 } ] }, { "cell_type": "markdown", "source": [ "If you set `n_iter_no_change` too low, training may stop too early and the model will underfit. But if you set it too high, it will overfit instead. We also set a fairly small learning rate and a high number of estimators, but the actual number of estimators in the trained ensemble is much lower, thanks to early stopping:" ], "metadata": { "id": "-JbdlvKh_hwO" } }, { "cell_type": "code", "source": [ "gbrt_best.n_estimators_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8zloEgk6_UJj", "outputId": "77810466-8ed3-4d26-848c-2faea98bc8b4" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "92" ] }, "metadata": {}, "execution_count": 124 } ] }, { "cell_type": "markdown", "source": [ "When `n_iter_no_change` is set, the `fit(`) method automatically splits the training set into a smaller training set and a validation set: this allows it to evaluate the model’s performance each time it adds a new tree. The size of the validation set is controlled by the `validation_fraction` hyperparameter, which is 10% by default. The tol hyperparameter determines the maximum performance improvement that still counts as negligible. It defaults to 0.0001." 
], "metadata": { "id": "L-vULKsT_mEO" } }, { "cell_type": "markdown", "source": [ "The GradientBoostingRegressor class also supports a subsample hyperparameter, which specifies the fraction of training instances to be used for training each tree. For example, if `subsample=0.25`, then each tree is trained on 25% of the training instances, selected randomly. As you can probably guess by now, this technique trades a higher bias for a lower variance. It also speeds up training considerably. This is called **Stochastic Gradient Boosting**." ], "metadata": { "id": "E8JMjIRF_xdd" } }, { "cell_type": "markdown", "source": [ "## XGBoost" ], "metadata": { "id": "KHUBGzQqZRhM" } }, { "cell_type": "markdown", "source": [ "### Classification task" ], "metadata": { "id": "HIsGDU0-bB-a" } }, { "cell_type": "markdown", "source": [ "Here are the essential steps to build an XGBoost classification model in scikit-learn using cross-validation." ], "metadata": { "id": "l41vqfEkbKcK" } }, { "cell_type": "code", "source": [ "iris = datasets.load_iris()\n", "df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],columns= iris['feature_names'] + ['target'])\n", "df.head()" ], "metadata": { "id": "ZVg6wzfA_Vyq", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "6a21e676-4e2b-44e6-f754-6b1c55ed0d58" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", "0 5.1 3.5 1.4 0.2 \n", "1 4.9 3.0 1.4 0.2 \n", "2 4.7 3.2 1.3 0.2 \n", "3 4.6 3.1 1.5 0.2 \n", "4 5.0 3.6 1.4 0.2 \n", "\n", " target \n", "0 0.0 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)target
05.13.51.40.20.0
14.93.01.40.20.0
24.73.21.30.20.0
34.63.11.50.20.0
45.03.61.40.20.0
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 38 } ] }, { "cell_type": "code", "source": [ "X_train, X_test, y_train, y_test = train_test_split(iris['data'], iris['target'], random_state=42)" ], "metadata": { "id": "G3R0nCm-Z4ly" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "The following template is for building an XGBoost classifier" ], "metadata": { "id": "Q6wn7paXZpj-" } }, { "cell_type": "code", "source": [ "xgb = XGBClassifier(booster='gbtree', objective='multi:softprob', \n", " learning_rate=0.1, n_estimators=100, random_state=42, n_jobs=-1)" ], "metadata": { "id": "rsjJb0POZ8Y7" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "1. `booster='gbtree'`: The booster is the base learner. It's the machine learning model that is constructed during every round of boosting. You may have guessed that 'gbtree' stands for gradient boosted tree, the XGBoost default base learner. It's uncommon but possible to work with other base learners, \n", "\n", "2. `objective='multi:softprob'`: Standard options for the objective can be viewed in the XGBoost official documentation, https://xgboost.readthedocs.io/en/latest/parameter.html, under Learning Task Parameters. The multi:softprob objective is a standard alternative to binary:logistic when the dataset includes multiple classes. It computes the probabilities of classification and chooses the highest one. If not explicitly stated, XGBoost will often find the right objective for you.\n", "\n", "3. `max_depth=6`: The max_depth of a tree determines the number of branches each tree has. It's one of the most important hyperparameters in making balanced predictions. XGBoost uses a default of 6, unlike random forests, which don't provide a value unless explicitly programmed.\n", "\n", "3. `learning_rate=0.1`: Within XGBoost, this hyperparameter is often referred to as eta. 
This hyperparameter limits the variance by reducing the weight of each tree to the given percentage. \n", "\n", "4. `n_estimators=100`: Popular among ensemble methods, `n_estimators` is the number of boosted trees in the model. Increasing this number while decreasing `learning_rate` can lead to more robust results." ], "metadata": { "id": "rDCT_1p9aNBi" } }, { "cell_type": "code", "source": [ "xgb.fit(X_train, y_train)\n", "y_pred = xgb.predict(X_test)\n", "score = accuracy_score(y_pred, y_test)\n", "print('Score: ' + str(score))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QYLOzVdMazA5", "outputId": "e270ed09-e3f4-4ade-d8df-cc3f1fd6433d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Score: 1.0\n" ] } ] }, { "cell_type": "code", "source": [ "xgb.get_params()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Q8LM-RkUQLDd", "outputId": "cd49710a-45fa-4a2e-ce82-860b66c41fa0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'base_score': 0.5,\n", " 'booster': 'gbtree',\n", " 'callbacks': None,\n", " 'colsample_bylevel': 1,\n", " 'colsample_bynode': 1,\n", " 'colsample_bytree': 1,\n", " 'early_stopping_rounds': None,\n", " 'enable_categorical': False,\n", " 'eval_metric': None,\n", " 'gamma': 0,\n", " 'gpu_id': -1,\n", " 'grow_policy': 'depthwise',\n", " 'importance_type': None,\n", " 'interaction_constraints': '',\n", " 'learning_rate': 0.1,\n", " 'max_bin': 256,\n", " 'max_cat_to_onehot': 4,\n", " 'max_delta_step': 0,\n", " 'max_depth': 6,\n", " 'max_leaves': 0,\n", " 'min_child_weight': 1,\n", " 'missing': nan,\n", " 'monotone_constraints': '()',\n", " 'n_estimators': 100,\n", " 'n_jobs': -1,\n", " 'num_parallel_tree': 1,\n", " 'objective': 'multi:softprob',\n", " 'predictor': 'auto',\n", " 'random_state': 42,\n", " 'reg_alpha': 0,\n", " 'reg_lambda': 1,\n", " 'sampling_method': 'uniform',\n", " 
'scale_pos_weight': None,\n", " 'subsample': 1,\n", " 'tree_method': 'exact',\n", " 'use_label_encoder': False,\n", " 'validate_parameters': 1,\n", " 'verbosity': None}" ] }, "metadata": {}, "execution_count": 43 } ] }, { "cell_type": "markdown", "source": [ "### Regression task" ], "metadata": { "id": "e4LM00_-bExu" } }, { "cell_type": "markdown", "source": [ "Here are the essential steps to build an XGBoost regression model in scikit-learn using cross-validation." ], "metadata": { "id": "7uFOM-SqbIZ6" } }, { "cell_type": "code", "source": [ "X,y = datasets.load_diabetes(return_X_y=True)\n", "\n", "xgb = XGBRegressor(booster='gbtree', objective='reg:squarederror', \n", " learning_rate=0.1, n_estimators=100, random_state=42, n_jobs=-1)\n", "\n", "scores = cross_val_score(xgb, X, y, scoring='neg_mean_squared_error', cv=5)\n", "\n", "# Take square root of the scores\n", "rmse = np.sqrt(-scores)\n", "\n", "# Display accuracy\n", "print('RMSE:', np.round(rmse, 3))\n", "\n", "# Display mean score\n", "print('RMSE mean: %0.3f' % (rmse.mean()))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "q5w9Jclca8Gs", "outputId": "7d67260e-3a04-4ba2-84f6-b95dc8227cf1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "RMSE: [63.033 59.689 64.538 63.699 64.661]\n", "RMSE mean: 63.124\n" ] } ] }, { "cell_type": "code", "source": [ "xgb.fit(X,y)\n", "xgb.get_params()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CbihBstGRQom", "outputId": "ce8ceff2-0207-4fac-e3c4-050b039028ad" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'base_score': 0.5,\n", " 'booster': 'gbtree',\n", " 'callbacks': None,\n", " 'colsample_bylevel': 1,\n", " 'colsample_bynode': 1,\n", " 'colsample_bytree': 1,\n", " 'early_stopping_rounds': None,\n", " 'enable_categorical': False,\n", " 'eval_metric': None,\n", " 'gamma': 0,\n", " 'gpu_id': -1,\n", " 
'grow_policy': 'depthwise',\n", " 'importance_type': None,\n", " 'interaction_constraints': '',\n", " 'learning_rate': 0.1,\n", " 'max_bin': 256,\n", " 'max_cat_to_onehot': 4,\n", " 'max_delta_step': 0,\n", " 'max_depth': 6,\n", " 'max_leaves': 0,\n", " 'min_child_weight': 1,\n", " 'missing': nan,\n", " 'monotone_constraints': '()',\n", " 'n_estimators': 100,\n", " 'n_jobs': -1,\n", " 'num_parallel_tree': 1,\n", " 'objective': 'reg:squarederror',\n", " 'predictor': 'auto',\n", " 'random_state': 42,\n", " 'reg_alpha': 0,\n", " 'reg_lambda': 1,\n", " 'sampling_method': 'uniform',\n", " 'scale_pos_weight': 1,\n", " 'subsample': 1,\n", " 'tree_method': 'exact',\n", " 'validate_parameters': 1,\n", " 'verbosity': None}" ] }, "metadata": {}, "execution_count": 50 } ] }, { "cell_type": "markdown", "source": [ "Without a baseline for comparison, we have no idea what that score means. Converting the target column, `y`, into a pandas DataFrame and calling the `.describe()` method gives the quartiles and the general statistics of the target column, as follows:" ], "metadata": { "id": "HAY1SjZxboMz" } }, { "cell_type": "code", "source": [ "pd.DataFrame(y).describe()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "F35eRsafbXnV", "outputId": "93fd4504-2aba-439f-e9b1-d85ed4f768fb" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " 0\n", "count 442.000000\n", "mean 152.133484\n", "std 77.093005\n", "min 25.000000\n", "25% 87.000000\n", "50% 140.500000\n", "75% 211.500000\n", "max 346.000000" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
count442.000000
mean152.133484
std77.093005
min25.000000
25%87.000000
50%140.500000
75%211.500000
max346.000000
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 49 } ] }, { "cell_type": "markdown", "source": [ "An RMSE of 63.124 is less than one standard deviation of the target (77.093), a respectable result." ], "metadata": { "id": "gqIRc8v6bubr" } }, { "cell_type": "markdown", "source": [ "### Speed comparison" ], "metadata": { "id": "FLDE-3-RfD4c" } }, { "cell_type": "markdown", "source": [ "Let's now compare the training speed of `GradientBoostingClassifier` and `XGBClassifier` on the [exoplanet dataset](https://www.kaggle.com/datasets/keplersmachines/kepler-labelled-time-series-data)." ], "metadata": { "id": "vKnuqcvafF1c" } }, { "cell_type": "code", "source": [ "!wget https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/raw/master/Chapter04/exoplanets.csv" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YLKF2oiTbtbn", "outputId": "f23977cc-bfed-44af-ffe2-48f5252e0b51" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2022-05-21 11:00:29-- https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/raw/master/Chapter04/exoplanets.csv\n", "Resolving github.com (github.com)... 140.82.114.4\n", "Connecting to github.com (github.com)|140.82.114.4|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: https://media.githubusercontent.com/media/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/master/Chapter04/exoplanets.csv [following]\n", "--2022-05-21 11:00:29-- https://media.githubusercontent.com/media/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/master/Chapter04/exoplanets.csv\n", "Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 262223348 (250M) [text/plain]\n", "Saving to: ‘exoplanets.csv’\n", "\n", "exoplanets.csv 100%[===================>] 250.08M 193MB/s in 1.3s \n", "\n", "2022-05-21 11:00:34 (193 MB/s) - ‘exoplanets.csv’ saved [262223348/262223348]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "df = pd.read_csv('exoplanets.csv')\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "PqwtKb68dZi7", "outputId": "c02e13ce-5922-46c1-bea3-86fe790b8a16" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " LABEL FLUX.1 FLUX.2 FLUX.3 FLUX.4 FLUX.5 FLUX.6 FLUX.7 \\\n", "0 2 93.85 83.81 20.10 -26.98 -39.56 -124.71 -135.18 \n", "1 2 -38.88 -33.83 -58.54 -40.09 -79.31 -72.81 -86.55 \n", "2 2 532.64 535.92 513.73 496.92 456.45 466.00 464.50 \n", "3 2 326.52 347.39 302.35 298.13 317.74 312.70 322.33 \n", "4 2 -1107.21 -1112.59 -1118.95 -1095.10 -1057.55 -1034.48 -998.34 \n", "\n", " FLUX.8 FLUX.9 ... FLUX.3188 FLUX.3189 FLUX.3190 FLUX.3191 \\\n", "0 -96.27 -79.89 ... -78.07 -102.15 -102.15 25.13 \n", "1 -85.33 -83.97 ... -3.28 -32.21 -32.21 -24.89 \n", "2 486.39 436.56 ... -71.69 13.31 13.31 -29.89 \n", "3 311.31 312.42 ... 5.71 -3.73 -3.73 30.05 \n", "4 -1022.71 -989.57 ... -594.37 -401.66 -401.66 -357.24 \n", "\n", " FLUX.3192 FLUX.3193 FLUX.3194 FLUX.3195 FLUX.3196 FLUX.3197 \n", "0 48.57 92.54 39.32 61.42 5.08 -39.54 \n", "1 -4.86 0.76 -11.70 6.46 16.00 19.93 \n", "2 -20.88 5.06 -11.80 -28.91 -70.02 -96.67 \n", "3 20.03 -12.67 -8.77 -17.31 -17.35 13.98 \n", "4 -443.76 -438.54 -399.71 -384.65 -411.79 -510.54 \n", "\n", "[5 rows x 3198 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LABELFLUX.1FLUX.2FLUX.3FLUX.4FLUX.5FLUX.6FLUX.7FLUX.8FLUX.9...FLUX.3188FLUX.3189FLUX.3190FLUX.3191FLUX.3192FLUX.3193FLUX.3194FLUX.3195FLUX.3196FLUX.3197
0293.8583.8120.10-26.98-39.56-124.71-135.18-96.27-79.89...-78.07-102.15-102.1525.1348.5792.5439.3261.425.08-39.54
12-38.88-33.83-58.54-40.09-79.31-72.81-86.55-85.33-83.97...-3.28-32.21-32.21-24.89-4.860.76-11.706.4616.0019.93
22532.64535.92513.73496.92456.45466.00464.50486.39436.56...-71.6913.3113.31-29.89-20.885.06-11.80-28.91-70.02-96.67
32326.52347.39302.35298.13317.74312.70322.33311.31312.42...5.71-3.73-3.7330.0520.03-12.67-8.77-17.31-17.3513.98
42-1107.21-1112.59-1118.95-1095.10-1057.55-1034.48-998.34-1022.71-989.57...-594.37-401.66-401.66-357.24-443.76-438.54-399.71-384.65-411.79-510.54
\n", "

5 rows × 3198 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 51 } ] }, { "cell_type": "code", "source": [ "# Split data into X and y\n", "X = df.iloc[:,1:]\n", "y = df.iloc[:,0]\n", "\n", "# Split data into train and test sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" ], "metadata": { "id": "NiW1d_VCdk7t" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "start = time.time()\n", "\n", "gbr = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=42)\n", "gbr.fit(X_train, y_train)\n", "y_pred = gbr.predict(X_test)\n", "score = accuracy_score(y_pred, y_test)\n", "print('Score: ' + str(score))\n", "\n", "end = time.time()\n", "elapsed = end - start\n", "\n", "print('Run Time: ' + str(elapsed) + ' seconds')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1Oslw4JXer5J", "outputId": "44836778-7c2a-4e79-e1a9-c240a978c260" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Score: 0.9874213836477987\n", "Run Time: 248.6392641067505 seconds\n" ] } ] }, { "cell_type": "code", "source": [ "start = time.time()\n", "\n", "# Instantiate the XGBoost classifier (the variable name xg_reg is kept unchanged)\n", "xg_reg = XGBClassifier(n_estimators=100, max_depth=2, random_state=42)\n", "\n", "# XGBoost requires class labels to start from 0 (as required since version 1.3.2).\n", "# Learn the label mapping from the training labels only, then reuse that same\n", "# mapping for the test labels so both splits share one encoding.\n", "le = LabelEncoder()\n", "y_train = le.fit_transform(y_train)\n", "y_test = le.transform(y_test)\n", "\n", "# Fit the classifier to the training set\n", "xg_reg.fit(X_train, y_train)\n", "\n", "# Predict labels of test set, y_pred\n", "y_pred = xg_reg.predict(X_test)\n", "\n", "score = accuracy_score(y_pred, y_test)\n", "\n", "print('Score: ' + str(score))\n", "\n", "end = time.time()\n", "elapsed = end - start\n", "\n", "print('Run Time: ' + str(elapsed) + ' seconds')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KWi-TIeee2BU", "outputId": "9c05b685-c9ee-4502-b4af-b405459c22b4" }, "execution_count": 
null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Score: 0.9913522012578616\n", "Run Time: 52.40076756477356 seconds\n" ] } ] }, { "cell_type": "markdown", "source": [ "When it comes to big data, an algorithm nearly five times as fast can save weeks or months of computational time and resources! This advantage is huge in the world of big data. In the world of boosting, XGBoost is the model of choice due to its unparalleled speed and impressive accuracy." ], "metadata": { "id": "jmX7g10ofdRD" } }, { "cell_type": "markdown", "source": [ "### Hyperparameter" ], "metadata": { "id": "AT6OUUregRfk" } }, { "cell_type": "markdown", "source": [ "XGBoost has many hyperparameters. XGBoost base learner hyperparameters incorporate all decision tree hyperparameters as a starting point. There are gradient boosting hyperparameters, since XGBoost is an enhanced version of gradient boosting." ], "metadata": { "id": "_YB4388QgiQr" } }, { "cell_type": "code", "source": [ "!wget https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/raw/master/Chapter06/heart_disease.csv" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "H-GTv5RtffvG", "outputId": "2c48abfd-5517-41c8-fe37-188b1b2b059d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2022-05-21 05:49:41-- https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/raw/master/Chapter06/heart_disease.csv\n", "Resolving github.com (github.com)... 140.82.121.3\n", "Connecting to github.com (github.com)|140.82.121.3|:443... connected.\n", "HTTP request sent, awaiting response... 
302 Found\n", "Location: https://media.githubusercontent.com/media/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/master/Chapter06/heart_disease.csv [following]\n", "--2022-05-21 05:49:41-- https://media.githubusercontent.com/media/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/master/Chapter06/heart_disease.csv\n", "Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 11328 (11K) [text/plain]\n", "Saving to: ‘heart_disease.csv.1’\n", "\n", "heart_disease.csv.1 100%[===================>] 11.06K --.-KB/s in 0s \n", "\n", "2022-05-21 05:49:41 (101 MB/s) - ‘heart_disease.csv.1’ saved [11328/11328]\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "df = pd.read_csv('heart_disease.csv')\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "Q6qyxyV5guIL", "outputId": "59a4af8c-697f-4897-eab8-7ee096d3bae1" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n", "0 63 1 3 145 233 1 0 150 0 2.3 0 \n", "1 37 1 2 130 250 0 1 187 0 3.5 0 \n", "2 41 0 1 130 204 0 0 172 0 1.4 2 \n", "3 56 1 1 120 236 0 1 178 0 0.8 2 \n", "4 57 0 0 120 354 0 1 163 1 0.6 2 \n", "\n", " ca thal target \n", "0 0 1 1 \n", "1 0 2 1 \n", "2 0 2 1 \n", "3 0 2 1 \n", "4 0 2 1 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathaltarget
063131452331015002.30011
137121302500118703.50021
241011302040017201.42021
356111202360117800.82021
457001203540116310.62021
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 58 } ] }, { "cell_type": "code", "source": [ "# Split data into X and y\n", "X = df.iloc[:, :-1]\n", "y = df.iloc[:, -1]" ], "metadata": { "id": "161dPMyKgx8q" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Before tuning hyperparameters, let's build a classifier so that we can obtain a baseline score as a starting point." ], "metadata": { "id": "7H3mCXj8hItV" } }, { "cell_type": "markdown", "source": [ "When fine-tuning hyperparameters, GridSearchCV and RandomizedSearchCV are the standard options. However, `cross_val_score` and `GridSearchCV/RandomizedSearchCV` do not split data the same way. One solution is to use `StratifiedKFold` whenever cross-validation is used.\n", "\n", "A stratified fold includes the same percentage of target values in each fold. If a dataset contains 60% 1s and 40% 0s in the target column, each stratified test set contains 60% 1s and 40% 0s. When folds are random, it's possible that one test set contains a 70-30 split while another contains a 50-50 split of target values." 
], "metadata": { "id": "_sr6Y00Zhsyr" } }, { "cell_type": "code", "source": [ "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)" ], "metadata": { "id": "iZzwJoOFiM4V" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# The 'binary:logistic' objective is standard for binary classification in determining the loss function\n", "model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=42)\n", "# Obtain scores of cross-validation\n", "scores = cross_val_score(model, X, y, cv=kfold)\n", "\n", "# Display accuracy\n", "print('Accuracy:', np.round(scores, 2))\n", "\n", "# Display mean accuracy\n", "print('Accuracy mean: %0.2f' % (scores.mean()))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-IHX7FVVg_DM", "outputId": "9079ca1c-719f-42cf-bb3a-f4bd92254fce" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Accuracy: [0.85 0.72 0.74 0.82 0.78]\n", "Accuracy mean: 0.78\n" ] } ] }, { "cell_type": "markdown", "source": [ "The point here is to use the same folds to obtain new scores when fine-tuning hyperparameters with GridSearchCV and RandomizedSearchCV so that the comparison of scores is fair." 
], "metadata": { "id": "fl1AJZVFiacl" } }, { "cell_type": "code", "source": [ "def grid_search(params, random=False):\n", "    \"\"\"Search `params` for an XGBClassifier and print the best result.\n", "\n", "    Parameters\n", "    ----------\n", "    params : dict\n", "        Hyperparameter names mapped to the candidate values to try.\n", "    random : bool, default False\n", "        If True, sample 20 candidates with RandomizedSearchCV;\n", "        otherwise evaluate every combination with GridSearchCV.\n", "\n", "    Reads the notebook-level `X` and `y`, prints the best parameters and\n", "    the best cross-validated score, and returns nothing.\n", "    \"\"\"\n", "    xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=42)\n", "\n", "    # Stratified, seeded folds so every search is scored on identical splits\n", "    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "    if random:\n", "        grid = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=20, n_jobs=-1, random_state=42)\n", "    else:\n", "        grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)\n", "\n", "    # Fit on the full X and y; the folds above provide the validation splits\n", "    grid.fit(X, y)\n", "\n", "    # Report the best parameters and their cross-validated score\n", "    print(\"Best params:\", grid.best_params_)\n", "    print(\"Best score: {:.5f}\".format(grid.best_score_))" ], "metadata": { "id": "hirVfvYGimTz" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "The XGBoost hyperparameters presented here are not meant to be exhaustive, but they are meant to be comprehensive. For a complete list of hyperparameters, read the official documentation, XGBoost Parameters, at https://xgboost.readthedocs.io/en/latest/parameter.html." ], "metadata": { "id": "FL9tTjDViy0U" } }, { "cell_type": "markdown", "source": [ "#### learning_rate" ], "metadata": { "id": "C5vqCuCyjgvc" } }, { "cell_type": "markdown", "source": [ "`learning_rate` shrinks the weights of trees for each round of boosting. By lowering `learning_rate`, more trees are required to produce better scores. Lowering `learning_rate` prevents overfitting because the size of the weights carried forward is smaller.\n", "\n", "A default value of 0.3 is used. 
Here is a starting range for `learning_rate` as placed inside our grid_search function:" ], "metadata": { "id": "9XT4L0KzjjVd" } }, { "cell_type": "code", "source": [ "grid_search(params={'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-Gte23anjEJL", "outputId": "3a126484-69f4-4d5a-d987-37a98f755acb" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Best params: {'learning_rate': 0.5}\n", "Best score: 0.80525\n" ] } ] }, { "cell_type": "markdown", "source": [ "lowering `learning_rate` may be advantageous when `n_estimators` goes up." ], "metadata": { "id": "bOJ9FRG8kidc" } }, { "cell_type": "markdown", "source": [ "#### max_depth" ], "metadata": { "id": "y7l7T2CNkSs0" } }, { "cell_type": "markdown", "source": [ "`max_depth` determines the length of the tree, equivalent to the number of rounds of splitting. Limiting `max_depth` prevents overfitting because the individual trees can only grow as far as `max_depth` allows. XGBoost provides a default `max_depth` value of six:" ], "metadata": { "id": "45DQQ5BUkVCk" } }, { "cell_type": "code", "source": [ "grid_search(params={'max_depth':[2, 3, 5, 6, 8]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yAjw3wKmjxRH", "outputId": "faa0934a-b667-46ed-b5ae-7d5417fbc051" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Best params: {'max_depth': 2}\n", "Best score: 0.79552\n" ] } ] }, { "cell_type": "markdown", "source": [ "Changing `max_depth` from 6 to 2 gave a better score. The lower value for `max_depth` means variance has been reduced." 
], "metadata": { "id": "L4Yw25h0kd_m" } }, { "cell_type": "markdown", "source": [ "#### gamma" ], "metadata": { "id": "Y8K3Shxwkm80" } }, { "cell_type": "markdown", "source": [ "Known as a Lagrange multiplier, `gamma` provides a threshold that nodes must surpass before making further splits according to the loss function. There is no upper limit to the value of `gamma`. The default is 0, and anything over 10 is considered very high. Increasing `gamma` results in a more conservative model:" ], "metadata": { "id": "I7oV3JtKkoK0" } }, { "cell_type": "code", "source": [ "grid_search(params={'gamma':[0, 0.01, 0.1, 0.5, 1, 2]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3q-MZjFLkaMz", "outputId": "18e0426b-b9da-4e98-b015-3188f8d35e50" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Best params: {'gamma': 1}\n", "Best score: 0.79880\n" ] } ] }, { "cell_type": "markdown", "source": [ "Changing gamma from 0 to 1 has resulted in a slight improvement." ], "metadata": { "id": "LlAtNDfuk4Yr" } }, { "cell_type": "markdown", "source": [ "#### min_child_weight" ], "metadata": { "id": "PE8jqb3lk7kb" } }, { "cell_type": "markdown", "source": [ "`min_child_weight` refers to the minimum sum of weights required for a node to split into a child. If the sum of the weights is less than the value of `min_child_weight`, no further splits are made. 
Increasing `min_child_weight` reduces overfitting:" ], "metadata": { "id": "H9m98Cg9k84E" } }, { "cell_type": "code", "source": [ "grid_search(params={'min_child_weight':[1, 2, 3, 4, 5]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FXgP8k9pkyMG", "outputId": "31488e96-01c4-4c94-9a9b-af13fc07c6f8" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Best params: {'min_child_weight': 5}\n", "Best score: 0.81202\n" ] } ] }, { "cell_type": "markdown", "source": [ "A slight adjustment to `min_child_weight` from 1 to 5 gives the best results yet." ], "metadata": { "id": "_Uj4v6NclNTk" } }, { "cell_type": "markdown", "source": [ "#### subsample" ], "metadata": { "id": "lpUThvNwlYR0" } }, { "cell_type": "markdown", "source": [ "The `subsample` hyperparameter limits the percentage of training instances (rows) for each boosting round. Decreasing `subsample` from 100% reduces overfitting:" ], "metadata": { "id": "0IbGAEDvlaEE" } }, { "cell_type": "code", "source": [ "grid_search(params={'subsample':[0.5, 0.7, 0.8, 0.9, 1]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BZmXj7R8lFqH", "outputId": "fa57581a-ec6d-42e4-8310-87b1f56a5ca8" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Best params: {'subsample': 0.5}\n", "Best score: 0.82525\n" ] } ] }, { "cell_type": "markdown", "source": [ "The score has improved by a slight amount once again, indicating a small presence of overfitting." ], "metadata": { "id": "oT8tJABxlkLt" } }, { "cell_type": "markdown", "source": [ "#### colsample_bytree" ], "metadata": { "id": "STS11E58lpS-" } }, { "cell_type": "markdown", "source": [ "Similar to `subsample`, `colsample_bytree` randomly selects particular columns according to the given percentage. `colsample_bytree` is useful for limiting the influence of columns and reducing variance. 
Note that `colsample_bytree` takes a percentage as input, not the number of columns:" ], "metadata": { "id": "3k5abqx8lq_F" } }, { "cell_type": "code", "source": [ "grid_search(params={'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "02a7625clgR7", "outputId": "a9d61e46-da14-47ce-cf48-af670272c8aa" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Best params: {'colsample_bytree': 0.5}\n", "Best score: 0.79874\n" ] } ] }, { "cell_type": "markdown", "source": [ "You are encouraged to try `colsample_bylevel` and `colsample_bynode` on your own. `colsample_bylevel` randomly selects columns for each tree depth, and `colsample_bynode` randomly selects columns when evaluating each tree split." ], "metadata": { "id": "C-6xeLBMl9nt" } }, { "cell_type": "markdown", "source": [ "#### n_estimators" ], "metadata": { "id": "5ISk5sini444" } }, { "cell_type": "markdown", "source": [ "Recall that `n_estimators` provides the number of trees in the ensemble. In the case of XGBoost, `n_estimators` is the number of trees trained on the residuals. Initialize a grid search of `n_estimators` with the default of 100, then double the number of trees through 800 as follows:" ], "metadata": { "id": "QbeWxPWei76T" } }, { "cell_type": "code", "source": [ "grid_search(params={'n_estimators':[100, 200, 400, 800]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "djxGbftfhKm8", "outputId": "a420b09b-6e7a-4552-cd6f-f9cf58adbe15" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Best params: {'n_estimators': 200}\n", "Best score: 0.79219\n" ] } ] }, { "cell_type": "markdown", "source": [ "Since our dataset is small, increasing `n_estimators` did not produce better results." 
], "metadata": { "id": "QYDz9YREjcLO" } }, { "cell_type": "markdown", "source": [ "#### Applying early stopping" ], "metadata": { "id": "s42ccMKbmDLF" } }, { "cell_type": "markdown", "source": [ "`early_stopping_rounds` is not a hyperparameter, but a strategy for optimizing the `n_estimators` hyperparameter.\n", "\n", "Normally when choosing hyperparameters, a test score is given after all boosting rounds are complete. To use early stopping, we need a test score after each round. `eval_metric` and `eval_set` may be used as parameters for `.fit` to generate test scores for each training round. `eval_metric` provides the scoring method, commonly 'error' for classification, and 'rmse' for regression. `eval_set` provides the test set to be evaluated, commonly `X_test` and `y_test`." ], "metadata": { "id": "_f41kkk0mPil" } }, { "cell_type": "markdown", "source": [ "The following steps display an evaluation metric for each round of training with the default `n_estimators=100`:" ], "metadata": { "id": "b_ND_pUUm9md" } }, { "cell_type": "code", "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", "model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=42)\n", "eval_set = [(X_test, y_test)]\n", "eval_metric='error'\n", "model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set)\n", "# make predictions for test data\n", "y_pred = model.predict(X_test)\n", "# evaluate predictions\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yUgNVaUJl10d", "outputId": "d8d452c7-7fa7-4679-c98c-e04efbb02ada" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[0]\tvalidation_0-error:0.23684\n", "[1]\tvalidation_0-error:0.22368\n", "[2]\tvalidation_0-error:0.22368\n", "[3]\tvalidation_0-error:0.21053\n", "[4]\tvalidation_0-error:0.22368\n", 
"[5]\tvalidation_0-error:0.18421\n", "[6]\tvalidation_0-error:0.21053\n", "[7]\tvalidation_0-error:0.22368\n", "[8]\tvalidation_0-error:0.19737\n", "[9]\tvalidation_0-error:0.19737\n", "[10]\tvalidation_0-error:0.18421\n", "[11]\tvalidation_0-error:0.18421\n", "[12]\tvalidation_0-error:0.19737\n", "[13]\tvalidation_0-error:0.17105\n", "[14]\tvalidation_0-error:0.18421\n", "[15]\tvalidation_0-error:0.18421\n", "[16]\tvalidation_0-error:0.18421\n", "[17]\tvalidation_0-error:0.18421\n", "[18]\tvalidation_0-error:0.18421\n", "[19]\tvalidation_0-error:0.19737\n", "[20]\tvalidation_0-error:0.18421\n", "[21]\tvalidation_0-error:0.18421\n", "[22]\tvalidation_0-error:0.18421\n", "[23]\tvalidation_0-error:0.18421\n", "[24]\tvalidation_0-error:0.18421\n", "[25]\tvalidation_0-error:0.18421\n", "[26]\tvalidation_0-error:0.18421\n", "[27]\tvalidation_0-error:0.18421\n", "[28]\tvalidation_0-error:0.18421\n", "[29]\tvalidation_0-error:0.18421\n", "[30]\tvalidation_0-error:0.18421\n", "[31]\tvalidation_0-error:0.17105\n", "[32]\tvalidation_0-error:0.18421\n", "[33]\tvalidation_0-error:0.18421\n", "[34]\tvalidation_0-error:0.18421\n", "[35]\tvalidation_0-error:0.17105\n", "[36]\tvalidation_0-error:0.17105\n", "[37]\tvalidation_0-error:0.17105\n", "[38]\tvalidation_0-error:0.18421\n", "[39]\tvalidation_0-error:0.17105\n", "[40]\tvalidation_0-error:0.18421\n", "[41]\tvalidation_0-error:0.18421\n", "[42]\tvalidation_0-error:0.18421\n", "[43]\tvalidation_0-error:0.18421\n", "[44]\tvalidation_0-error:0.18421\n", "[45]\tvalidation_0-error:0.18421\n", "[46]\tvalidation_0-error:0.18421\n", "[47]\tvalidation_0-error:0.18421\n", "[48]\tvalidation_0-error:0.18421\n", "[49]\tvalidation_0-error:0.18421\n", "[50]\tvalidation_0-error:0.18421\n", "[51]\tvalidation_0-error:0.18421\n", "[52]\tvalidation_0-error:0.18421\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/xgboost/sklearn.py:797: UserWarning: `eval_metric` in `fit` method is deprecated 
for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.\n", " UserWarning,\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[53]\tvalidation_0-error:0.18421\n", "[54]\tvalidation_0-error:0.18421\n", "[55]\tvalidation_0-error:0.18421\n", "[56]\tvalidation_0-error:0.18421\n", "[57]\tvalidation_0-error:0.18421\n", "[58]\tvalidation_0-error:0.18421\n", "[59]\tvalidation_0-error:0.18421\n", "[60]\tvalidation_0-error:0.18421\n", "[61]\tvalidation_0-error:0.18421\n", "[62]\tvalidation_0-error:0.18421\n", "[63]\tvalidation_0-error:0.18421\n", "[64]\tvalidation_0-error:0.18421\n", "[65]\tvalidation_0-error:0.18421\n", "[66]\tvalidation_0-error:0.18421\n", "[67]\tvalidation_0-error:0.18421\n", "[68]\tvalidation_0-error:0.18421\n", "[69]\tvalidation_0-error:0.18421\n", "[70]\tvalidation_0-error:0.18421\n", "[71]\tvalidation_0-error:0.18421\n", "[72]\tvalidation_0-error:0.18421\n", "[73]\tvalidation_0-error:0.18421\n", "[74]\tvalidation_0-error:0.18421\n", "[75]\tvalidation_0-error:0.18421\n", "[76]\tvalidation_0-error:0.18421\n", "[77]\tvalidation_0-error:0.18421\n", "[78]\tvalidation_0-error:0.18421\n", "[79]\tvalidation_0-error:0.18421\n", "[80]\tvalidation_0-error:0.18421\n", "[81]\tvalidation_0-error:0.18421\n", "[82]\tvalidation_0-error:0.18421\n", "[83]\tvalidation_0-error:0.18421\n", "[84]\tvalidation_0-error:0.18421\n", "[85]\tvalidation_0-error:0.18421\n", "[86]\tvalidation_0-error:0.18421\n", "[87]\tvalidation_0-error:0.18421\n", "[88]\tvalidation_0-error:0.18421\n", "[89]\tvalidation_0-error:0.18421\n", "[90]\tvalidation_0-error:0.18421\n", "[91]\tvalidation_0-error:0.18421\n", "[92]\tvalidation_0-error:0.18421\n", "[93]\tvalidation_0-error:0.18421\n", "[94]\tvalidation_0-error:0.18421\n", "[95]\tvalidation_0-error:0.18421\n", "[96]\tvalidation_0-error:0.18421\n", "[97]\tvalidation_0-error:0.18421\n", "[98]\tvalidation_0-error:0.18421\n", "[99]\tvalidation_0-error:0.18421\n", "Accuracy: 81.58%\n" ] } ] }, 
{ "cell_type": "markdown", "source": [ "We know that `StratifiedKFold` cross-validation gives a mean accuracy of 78% when n_estimators=100. The disparity in scores comes from the difference in test sets." ], "metadata": { "id": "YUYIc8cvm3X1" } }, { "cell_type": "markdown", "source": [ "#### early_stopping_rounds" ], "metadata": { "id": "rHYUq8m-nCed" } }, { "cell_type": "markdown", "source": [ "`early_stopping_rounds` is an optional parameter to include with `eval_metric` and `eval_set` when fitting a model. Let's try `early_stopping_rounds=10`.\n", "The previous code is repeated with `early_stopping_rounds=10` added in:" ], "metadata": { "id": "rbV-EIxdnD3F" } }, { "cell_type": "code", "source": [ "model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=42)\n", "eval_set = [(X_test, y_test)]\n", "eval_metric=\"error\"\n", "model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=10, verbose=True)\n", "y_pred = model.predict(X_test)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "S-8zyAq5nQQN", "outputId": "87c02e94-59b9-4db7-e953-0654af1f16c4" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[0]\tvalidation_0-error:0.23684\n", "[1]\tvalidation_0-error:0.22368\n", "[2]\tvalidation_0-error:0.22368\n", "[3]\tvalidation_0-error:0.21053\n", "[4]\tvalidation_0-error:0.22368\n", "[5]\tvalidation_0-error:0.18421\n", "[6]\tvalidation_0-error:0.21053\n", "[7]\tvalidation_0-error:0.22368\n", "[8]\tvalidation_0-error:0.19737\n", "[9]\tvalidation_0-error:0.19737\n", "[10]\tvalidation_0-error:0.18421\n", "[11]\tvalidation_0-error:0.18421\n", "[12]\tvalidation_0-error:0.19737\n", "[13]\tvalidation_0-error:0.17105\n", "[14]\tvalidation_0-error:0.18421\n", "[15]\tvalidation_0-error:0.18421\n", "[16]\tvalidation_0-error:0.18421\n", 
"[17]\tvalidation_0-error:0.18421\n", "[18]\tvalidation_0-error:0.18421\n", "[19]\tvalidation_0-error:0.19737\n", "[20]\tvalidation_0-error:0.18421\n", "[21]\tvalidation_0-error:0.18421\n", "[22]\tvalidation_0-error:0.18421\n", "Accuracy: 82.89%\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/xgboost/sklearn.py:797: UserWarning: `eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.\n", " UserWarning,\n", "/usr/local/lib/python3.7/dist-packages/xgboost/sklearn.py:797: UserWarning: `early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.\n", " UserWarning,\n" ] } ] }, { "cell_type": "markdown", "source": [ "A more thorough approach is to use larger values, say, `n_estimators = 5000` and `early_stopping_rounds=100`. By setting `early_stopping_rounds=100`, you are guaranteed to reach the default of 100 boosted trees presented by XGBoost.\n", "Here is the code that gives a maximum of 5,000 trees and that will stop after 100 consecutive rounds fail to find any improvement:" ], "metadata": { "id": "iFNCBPFVnoI-" } }, { "cell_type": "code", "source": [ "model = XGBClassifier(random_state=42, n_estimators=5000)\n", "eval_set = [(X_test, y_test)]\n", "eval_metric=\"error\"\n", "model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=100)\n", "y_pred = model.predict(X_test)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TM8sE9vJmqrf", "outputId": "be9a8754-3fa0-4e34-d8a1-1a9407333ec9" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[0]\tvalidation_0-error:0.23684\n", "[1]\tvalidation_0-error:0.22368\n", 
"[2]\tvalidation_0-error:0.22368\n", "[3]\tvalidation_0-error:0.21053\n", "[4]\tvalidation_0-error:0.22368\n", "[5]\tvalidation_0-error:0.18421\n", "[6]\tvalidation_0-error:0.21053\n", "[7]\tvalidation_0-error:0.22368\n", "[8]\tvalidation_0-error:0.19737\n", "[9]\tvalidation_0-error:0.19737\n", "[10]\tvalidation_0-error:0.18421\n", "[11]\tvalidation_0-error:0.18421\n", "[12]\tvalidation_0-error:0.19737\n", "[13]\tvalidation_0-error:0.17105\n", "[14]\tvalidation_0-error:0.18421\n", "[15]\tvalidation_0-error:0.18421\n", "[16]\tvalidation_0-error:0.18421\n", "[17]\tvalidation_0-error:0.18421\n", "[18]\tvalidation_0-error:0.18421\n", "[19]\tvalidation_0-error:0.19737\n", "[20]\tvalidation_0-error:0.18421\n", "[21]\tvalidation_0-error:0.18421\n", "[22]\tvalidation_0-error:0.18421\n", "[23]\tvalidation_0-error:0.18421\n", "[24]\tvalidation_0-error:0.18421\n", "[25]\tvalidation_0-error:0.18421\n", "[26]\tvalidation_0-error:0.18421\n", "[27]\tvalidation_0-error:0.18421\n", "[28]\tvalidation_0-error:0.18421\n", "[29]\tvalidation_0-error:0.18421\n", "[30]\tvalidation_0-error:0.18421\n", "[31]\tvalidation_0-error:0.17105\n", "[32]\tvalidation_0-error:0.18421\n", "[33]\tvalidation_0-error:0.18421\n", "[34]\tvalidation_0-error:0.18421\n", "[35]\tvalidation_0-error:0.17105\n", "[36]\tvalidation_0-error:0.17105\n", "[37]\tvalidation_0-error:0.17105\n", "[38]\tvalidation_0-error:0.18421\n", "[39]\tvalidation_0-error:0.17105\n", "[40]\tvalidation_0-error:0.18421\n", "[41]\tvalidation_0-error:0.18421\n", "[42]\tvalidation_0-error:0.18421\n", "[43]\tvalidation_0-error:0.18421\n", "[44]\tvalidation_0-error:0.18421\n", "[45]\tvalidation_0-error:0.18421\n", "[46]\tvalidation_0-error:0.18421\n", "[47]\tvalidation_0-error:0.18421\n", "[48]\tvalidation_0-error:0.18421\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/xgboost/sklearn.py:797: UserWarning: `eval_metric` in `fit` method is deprecated for better compatibility with 
scikit-learn, use `eval_metric` in constructor or`set_params` instead.\n", " UserWarning,\n", "/usr/local/lib/python3.7/dist-packages/xgboost/sklearn.py:797: UserWarning: `early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.\n", " UserWarning,\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[49]\tvalidation_0-error:0.18421\n", "[50]\tvalidation_0-error:0.18421\n", "[51]\tvalidation_0-error:0.18421\n", "[52]\tvalidation_0-error:0.18421\n", "[53]\tvalidation_0-error:0.18421\n", "[54]\tvalidation_0-error:0.18421\n", "[55]\tvalidation_0-error:0.18421\n", "[56]\tvalidation_0-error:0.18421\n", "[57]\tvalidation_0-error:0.18421\n", "[58]\tvalidation_0-error:0.18421\n", "[59]\tvalidation_0-error:0.18421\n", "[60]\tvalidation_0-error:0.18421\n", "[61]\tvalidation_0-error:0.18421\n", "[62]\tvalidation_0-error:0.18421\n", "[63]\tvalidation_0-error:0.18421\n", "[64]\tvalidation_0-error:0.18421\n", "[65]\tvalidation_0-error:0.18421\n", "[66]\tvalidation_0-error:0.18421\n", "[67]\tvalidation_0-error:0.18421\n", "[68]\tvalidation_0-error:0.18421\n", "[69]\tvalidation_0-error:0.18421\n", "[70]\tvalidation_0-error:0.18421\n", "[71]\tvalidation_0-error:0.18421\n", "[72]\tvalidation_0-error:0.18421\n", "[73]\tvalidation_0-error:0.18421\n", "[74]\tvalidation_0-error:0.18421\n", "[75]\tvalidation_0-error:0.18421\n", "[76]\tvalidation_0-error:0.18421\n", "[77]\tvalidation_0-error:0.18421\n", "[78]\tvalidation_0-error:0.18421\n", "[79]\tvalidation_0-error:0.18421\n", "[80]\tvalidation_0-error:0.18421\n", "[81]\tvalidation_0-error:0.18421\n", "[82]\tvalidation_0-error:0.18421\n", "[83]\tvalidation_0-error:0.18421\n", "[84]\tvalidation_0-error:0.18421\n", "[85]\tvalidation_0-error:0.18421\n", "[86]\tvalidation_0-error:0.18421\n", "[87]\tvalidation_0-error:0.18421\n", "[88]\tvalidation_0-error:0.18421\n", "[89]\tvalidation_0-error:0.18421\n", 
"[90]\tvalidation_0-error:0.18421\n", "[91]\tvalidation_0-error:0.18421\n", "[92]\tvalidation_0-error:0.18421\n", "[93]\tvalidation_0-error:0.18421\n", "[94]\tvalidation_0-error:0.18421\n", "[95]\tvalidation_0-error:0.18421\n", "[96]\tvalidation_0-error:0.18421\n", "[97]\tvalidation_0-error:0.18421\n", "[98]\tvalidation_0-error:0.18421\n", "[99]\tvalidation_0-error:0.18421\n", "[100]\tvalidation_0-error:0.18421\n", "[101]\tvalidation_0-error:0.18421\n", "[102]\tvalidation_0-error:0.18421\n", "[103]\tvalidation_0-error:0.18421\n", "[104]\tvalidation_0-error:0.18421\n", "[105]\tvalidation_0-error:0.18421\n", "[106]\tvalidation_0-error:0.18421\n", "[107]\tvalidation_0-error:0.18421\n", "[108]\tvalidation_0-error:0.18421\n", "[109]\tvalidation_0-error:0.18421\n", "[110]\tvalidation_0-error:0.18421\n", "[111]\tvalidation_0-error:0.18421\n", "[112]\tvalidation_0-error:0.18421\n", "Accuracy: 82.89%\n" ] } ] }, { "cell_type": "markdown", "source": [ "After 100 rounds of boosting, the score provided by 13 trees is the best." ], "metadata": { "id": "bSdrD_UWn9z2" } }, { "cell_type": "markdown", "source": [ "#### Automatically hyperparamter tuning" ], "metadata": { "id": "826da7dWNGeH" } }, { "cell_type": "markdown", "source": [ "You are encourage to try https://github.com/optuna/optuna-examples/blob/main/xgboost/xgboost_simple.py for hyperparameter tuning." ], "metadata": { "id": "O76mLAlyNNU-" } }, { "cell_type": "markdown", "source": [ "### For categorical variable and missing value" ], "metadata": { "id": "SHHXkhtpJ0w9" } }, { "cell_type": "markdown", "source": [ "XGBoost has experiment support for categorical variable, you can check out here: https://xgboost.readthedocs.io/en/latest/tutorials/categorical.html. But it only works for a few tree methods, it is still recommend to encode your data https://www.kaggle.com/code/shahules/an-overview-of-encoding-techniques/notebook. 
Missing value, on the other hand can be handled by XGBoost as described at https://xgboost.readthedocs.io/en/stable/faq.html#how-to-deal-with-missing-values." ], "metadata": { "id": "BNREDHc_Ig5E" } }, { "cell_type": "code", "source": [ "# Select target\n", "data = pd.read_csv('melb_data.csv')\n", "y = data.Price\n", "\n", "# To keep things simple, we'll split the columns into numerical can categorical features\n", "melb_predictors = data.drop(['Price', 'Date', 'Address'], axis=1)\n", "cat_col = melb_predictors.select_dtypes(exclude=['int64','float64'])\n", "\n", "# Divide data into training and validation subsets\n", "X, X_v, y_train, y_valid = train_test_split(melb_predictors, y, train_size=0.8, test_size=0.2, random_state=0)\n", "X_train = X.select_dtypes(exclude=['object'])\n", "X_valid = X_v.select_dtypes(exclude=['object'])\n", "X_train_cat = X.select_dtypes(exclude=['int64','float64'])\n", "X_valid_cat = X_v.select_dtypes(exclude=['int64','float64'])" ], "metadata": { "id": "qFSRGgsFI4H_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "for col in X_train_cat.columns:\n", " X[col] = X[col].astype('category')\n", " X_v[col] = X_v[col].astype('category')" ], "metadata": { "id": "5dJbtmI5-U7t" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "xgb = XGBRegressor(booster='gbtree', objective='reg:squarederror', \n", " random_state=42, n_jobs=-1) # You can either specify missing=-9999 or leave it as it is\n", "xgb.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PqKgSpglLclJ", "outputId": "cb544b30-328a-404c-a403-44aedea4f4d8" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,\n", " colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,\n", " early_stopping_rounds=None, enable_categorical=False,\n", " eval_metric=None, gamma=0, 
gpu_id=-1, grow_policy='depthwise',\n", " importance_type=None, interaction_constraints='',\n", " learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,\n", " max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,\n", " missing=nan, monotone_constraints='()', n_estimators=100,\n", " n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=42,\n", " reg_alpha=0, reg_lambda=1, ...)" ] }, "metadata": {}, "execution_count": 176 } ] }, { "cell_type": "code", "source": [ "preds = xgb.predict(X_valid)" ], "metadata": { "id": "9JpLF4ECLnha" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1yuRQhnjLtLi", "outputId": "875024d9-f93b-4b39-e0af-0dcd88e1253d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "173639.76325478646" ] }, "metadata": {}, "execution_count": 178 } ] }, { "cell_type": "code", "source": [ "explainer = shap.Explainer(xgb)\n", "shap_values = explainer(X_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oVDw3PEvgj6b", "outputId": "00821643-0773-4399-958e-d155e52e3005" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "ntree_limit is deprecated, use `iteration_range` or model slicing instead.\n" ] } ] }, { "cell_type": "markdown", "source": [ "### SHAP values" ], "metadata": { "id": "Z7FHMo3tj_Bj" } }, { "cell_type": "code", "source": [ "shap.summary_plot(shap_values, X_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 411 }, "id": "v2dATeDRgwLh", "outputId": "0b950859-186a-426a-98b7-23b796a1183e" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "## Lightgbm" ], "metadata": { "id": "t58ejslZMJ_P" } }, { "cell_type": "markdown", "source": [ "### Classification task" ], "metadata": { "id": "RiQQUpHDP0In" } }, { "cell_type": "code", "source": [ "iris = datasets.load_iris()\n", "df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],columns= iris['feature_names'] + ['target'])\n", "X_train, X_test, y_train, y_test = train_test_split(iris['data'], iris['target'], random_state=42)" ], "metadata": { "id": "X50nza8ILxQb" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "clf = lgb.LGBMClassifier(boosting_type='gbdt', learning_rate=0.1, n_estimators=100, random_state=42, n_jobs=-1)\n", "clf.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ff20VXGMQIXR", "outputId": "1a131638-cefd-45df-9754-5e8fd88934f0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LGBMClassifier(random_state=42)" ] }, "metadata": {}, "execution_count": 34 } ] }, { "cell_type": "code", "source": [ "clf.fit(X_train, y_train)\n", "y_pred = clf.predict(X_test)\n", "score = accuracy_score(y_pred, y_test)\n", "print('Score: ' + str(score))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zC6X-CFEQ2m3", "outputId": "d35489eb-ef3d-4ee6-a334-ddf2294e0419" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Score: 1.0\n" ] } ] }, { "cell_type": "code", "source": [ "clf.get_params()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KcTMWhn8QBil", "outputId": "3e98dceb-2ef7-4c64-b601-18a0a5115e4b" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'boosting_type': 'gbdt',\n", " 'class_weight': None,\n", " 'colsample_bytree': 1.0,\n", " 'importance_type': 
'split',\n", " 'learning_rate': 0.1,\n", " 'max_depth': -1,\n", " 'min_child_samples': 20,\n", " 'min_child_weight': 0.001,\n", " 'min_split_gain': 0.0,\n", " 'n_estimators': 100,\n", " 'n_jobs': -1,\n", " 'num_leaves': 31,\n", " 'objective': None,\n", " 'random_state': 42,\n", " 'reg_alpha': 0.0,\n", " 'reg_lambda': 0.0,\n", " 'silent': 'warn',\n", " 'subsample': 1.0,\n", " 'subsample_for_bin': 200000,\n", " 'subsample_freq': 0}" ] }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "markdown", "source": [ "### Regression task" ], "metadata": { "id": "W_yDFE8oRmPh" } }, { "cell_type": "code", "source": [ "X,y = datasets.load_diabetes(return_X_y=True)\n", "\n", "lgbr = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.1, n_estimators=100, random_state=42, n_jobs=-1)\n", "\n", "scores = cross_val_score(lgbr, X, y, scoring='neg_mean_squared_error', cv=5)\n", "\n", "# Take square root of the scores\n", "rmse = np.sqrt(-scores)\n", "\n", "# Display accuracy\n", "print('RMSE:', np.round(rmse, 3))\n", "\n", "# Display mean score\n", "print('RMSE mean: %0.3f' % (rmse.mean()))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SQnUryTZQ-l_", "outputId": "8e67190f-8a7c-4ad0-adfb-d54dd41d6fd9" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "RMSE: [56.081 59.172 63.191 61.833 60.542]\n", "RMSE mean: 60.164\n" ] } ] }, { "cell_type": "code", "source": [ "lgbr.fit(X,y)\n", "lgbr.get_params()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DeDcf6bhRdOO", "outputId": "d9451285-9cee-438a-b857-58aec5d3ed3d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'boosting_type': 'gbdt',\n", " 'class_weight': None,\n", " 'colsample_bytree': 1.0,\n", " 'importance_type': 'split',\n", " 'learning_rate': 0.1,\n", " 'max_depth': -1,\n", " 'min_child_samples': 20,\n", " 'min_child_weight': 0.001,\n", " 'min_split_gain': 
0.0,\n", " 'n_estimators': 100,\n", " 'n_jobs': -1,\n", " 'num_leaves': 31,\n", " 'objective': None,\n", " 'random_state': 42,\n", " 'reg_alpha': 0.0,\n", " 'reg_lambda': 0.0,\n", " 'silent': 'warn',\n", " 'subsample': 1.0,\n", " 'subsample_for_bin': 200000,\n", " 'subsample_freq': 0}" ] }, "metadata": {}, "execution_count": 52 } ] }, { "cell_type": "markdown", "source": [ "### Speed" ], "metadata": { "id": "gEFp_iDMR8Jw" } }, { "cell_type": "code", "source": [ "df = pd.read_csv('exoplanets.csv')\n", "# Split data into X and y\n", "X = df.iloc[:,1:]\n", "y = df.iloc[:,0]\n", "\n", "# Split data into train and test sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" ], "metadata": { "id": "sFvSB-AoSM-g" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "start = time.time()\n", "\n", "# Instantiate the XGBRegressor, xg_reg\n", "lg_reg = lgb.LGBMClassifier(n_estimators=100, max_depth=2, random_state=42)\n", "\n", "# Fit xg_reg to training set\n", "lg_reg.fit(X_train, y_train)\n", "\n", "# Predict labels of test set, y_pred\n", "y_pred = lg_reg.predict(X_test)\n", "\n", "score = accuracy_score(y_pred, y_test)\n", "\n", "print('Score: ' + str(score))\n", "\n", "end = time.time()\n", "elapsed = end - start\n", "\n", "print('Run Time: ' + str(elapsed) + ' seconds')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "P2mf-IgRR3zt", "outputId": "108b41d9-5f96-48e3-d77c-5a413ee77d85" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Score: 0.9913522012578616\n", "Run Time: 7.302560567855835 seconds\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Hyperparameter" ], "metadata": { "id": "rfvM90-HSayn" } }, { "cell_type": "markdown", "source": [ "Following set of practices can be used to improve your model efficiency.\n", "\n", "* **num_leaves** : This is the main parameter to control the complexity of the tree model. 
Ideally, the value of `num_leaves` should be less than or equal to 2^(max_depth). A value larger than this will result in overfitting.\n", "\n", "* **min_data_in_leaf** : Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. In practice, setting it to hundreds or thousands is enough for a large dataset.\n", "\n", "* **max_depth** : We also can use `max_depth` to limit the tree depth explicitly." ], "metadata": { "id": "vJIrKihuSd83" } }, { "cell_type": "markdown", "source": [ "1. **For Faster Speed**\n", "\n", "* Use bagging by setting `bagging_fraction` and `bagging_freq`.\n", "* Use feature sub-sampling by setting `feature_fraction`.\n", "* Use small `max_bin`.\n", "* Use `save_binary` to speed up data loading in future learning." ], "metadata": { "id": "Xiv_JVHiS4ww" } }, { "cell_type": "markdown", "source": [ "2. **For better accuracy**\n", "\n", "* Use large `max_bin` (may be slower).\n", "* Use small `learning_rate` with large `num_iterations`\n", "* Use large `num_leaves` (may cause over-fitting)\n", "* Try to use categorical features directly." ], "metadata": { "id": "X4V_yKNBS_CI" } }, { "cell_type": "markdown", "source": [ "3. **To deal with over-fitting**\n", "\n", "* Use `min_data_in_leaf` and `min_sum_hessian_in_leaf`\n", "* Try `lambda_l1`, `lambda_l2` and `min_gain_to_split` for regularization\n", "* Try `max_depth` to avoid growing deep trees\n", "* Try `dart`" ], "metadata": { "id": "SFw9dI43TLbA" } }, { "cell_type": "markdown", "source": [ "Check https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html for hyperparameter tuning." ], "metadata": { "id": "OnbuqeIvTXP5" } }, { "cell_type": "markdown", "source": [ "### For categorical variable and missing value" ], "metadata": { "id": "V7LmHQzR7RtF" } }, { "cell_type": "markdown", "source": [ "LightGBM enables missing-value handling by default. See https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#missing-value-handle. 
It also deals with categorical variables, as described here: https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support" ], "metadata": { "id": "HTaO1Nx47YZd" } }, { "cell_type": "code", "source": [ "# Select target\n", "data = pd.read_csv('melb_data.csv')\n", "y = data.Price\n", "\n", "# To keep things simple, we'll split the columns into numerical can categorical features\n", "melb_predictors = data.drop(['Price', 'Date', 'Address'], axis=1)\n", "cat_col = melb_predictors.select_dtypes(exclude=['int64','float64'])\n", "\n", "# Divide data into training and validation subsets\n", "X, X_v, y_train, y_valid = train_test_split(melb_predictors, y, train_size=0.8, test_size=0.2, random_state=0)\n", "X_train = X.select_dtypes(exclude=['object'])\n", "X_valid = X_v.select_dtypes(exclude=['object'])\n", "X_train_cat = X.select_dtypes(exclude=['int64','float64'])\n", "X_valid_cat = X_v.select_dtypes(exclude=['int64','float64'])" ], "metadata": { "id": "69qkUi3qTWxS" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "for col in X_train_cat.columns:\n", " X[col] = X[col].astype('category')\n", " X_v[col] = X_v[col].astype('category')" ], "metadata": { "id": "N3DIZVNS9d4d" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "lgbr = lgb.LGBMRegressor(boosting_type='gbdt', random_state=42, n_jobs=-1)\n", "lgbr.fit(X, y_train)" ], "metadata": { "id": "jiF3wMm7SHRP", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8a07db4b-b48b-401f-dd98-3cfaf98f9584" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LGBMRegressor(random_state=42)" ] }, "metadata": {}, "execution_count": 158 } ] }, { "cell_type": "code", "source": [ "preds = lgbr.predict(X_v)" ], "metadata": { "id": "brGm9FFR8eIf" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { 
"base_uri": "https://localhost:8080/" }, "id": "wpS4eGVe9ujW", "outputId": "fd13a1a8-e48c-4811-f918-d5db5fd39463" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "160267.37406568974" ] }, "metadata": {}, "execution_count": 160 } ] }, { "cell_type": "markdown", "source": [ "### SHAP values" ], "metadata": { "id": "RQpYAqIKkEM6" } }, { "cell_type": "code", "source": [ "explainer = shap.Explainer(lgbr)\n", "shap_values = explainer.shap_values(X)" ], "metadata": { "id": "RrbIM2sUg5_B" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "shap.summary_plot(shap_values, X)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 542 }, "id": "Oeql_GBTiMwh", "outputId": "08e906f1-cf14-4d3e-8eb5-997e33285888" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "source": [ "## CatBoost" ], "metadata": { "id": "tMrt7a0u-wAG" } }, { "cell_type": "markdown", "source": [ "In this section, we would explore some base cases of using catboost, such as model training, cross-validation and predicting" ], "metadata": { "id": "UpYxeNoyM7S9" } }, { "cell_type": "markdown", "source": [ "### Classification task" ], "metadata": { "id": "-LL0QrhgM_i0" } }, { "cell_type": "code", "source": [ "iris = datasets.load_iris()\n", "df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],columns= iris['feature_names'] + ['target'])\n", "X_train, X_test, y_train, y_test = train_test_split(iris['data'], iris['target'], random_state=42)" ], "metadata": { "id": "a1UBXodT9yvW" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "clf = CatBoostClassifier(boosting_type='Plain', learning_rate=0.1, n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent')\n", "clf.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PojeH09ENdbi", "outputId": "181716a5-6e96-4907-fd27-adb98f7e1b36" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 30 } ] }, { "cell_type": "code", "source": [ "clf.fit(X_train, y_train)\n", "y_pred = clf.predict(X_test)\n", "score = accuracy_score(y_pred, y_test)\n", "print('Score: ' + str(score))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yM5MyOi1P9Fd", "outputId": "6a64a378-6ea1-4e93-e784-2e18131690c4" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Score: 1.0\n" ] } ] }, { "cell_type": "code", "source": [ "clf.get_all_params()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "B1jqoOOXOpVL", "outputId": "67ad9981-1813-4dbf-9410-2b64f0bdd01c" 
}, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'auto_class_weights': 'None',\n", " 'bagging_temperature': 1,\n", " 'bayesian_matrix_reg': 0.10000000149011612,\n", " 'best_model_min_trees': 1,\n", " 'boost_from_average': False,\n", " 'boosting_type': 'Plain',\n", " 'bootstrap_type': 'Bayesian',\n", " 'border_count': 254,\n", " 'class_names': [0, 1, 2],\n", " 'classes_count': 0,\n", " 'depth': 6,\n", " 'eval_metric': 'MultiClass',\n", " 'feature_border_type': 'GreedyLogSum',\n", " 'force_unit_auto_pair_weights': False,\n", " 'grow_policy': 'SymmetricTree',\n", " 'iterations': 100,\n", " 'l2_leaf_reg': 3,\n", " 'leaf_estimation_backtracking': 'AnyImprovement',\n", " 'leaf_estimation_iterations': 1,\n", " 'leaf_estimation_method': 'Newton',\n", " 'learning_rate': 0.10000000149011612,\n", " 'loss_function': 'MultiClass',\n", " 'max_leaves': 64,\n", " 'min_data_in_leaf': 1,\n", " 'model_shrink_mode': 'Constant',\n", " 'model_shrink_rate': 0,\n", " 'model_size_reg': 0.5,\n", " 'nan_mode': 'Min',\n", " 'penalties_coefficient': 1,\n", " 'pool_metainfo_options': {'tags': {}},\n", " 'posterior_sampling': False,\n", " 'random_seed': 42,\n", " 'random_strength': 1,\n", " 'rsm': 1,\n", " 'sampling_frequency': 'PerTree',\n", " 'score_function': 'Cosine',\n", " 'sparse_features_conflict_fraction': 0,\n", " 'task_type': 'CPU',\n", " 'use_best_model': False}" ] }, "metadata": {}, "execution_count": 32 } ] }, { "cell_type": "markdown", "source": [ "### Regression" ], "metadata": { "id": "Gm_vj7HGQYn1" } }, { "cell_type": "code", "source": [ "X,y = datasets.load_diabetes(return_X_y=True)\n", "\n", "catb = CatBoostRegressor(boosting_type='Plain', learning_rate=0.1, random_state=42, n_estimators=100, thread_count=-1, logging_level = 'Silent')\n", "\n", "scores = cross_val_score(catb, X, y, scoring='neg_mean_squared_error', cv=5)\n", "\n", "# Take square root of the scores\n", "rmse = np.sqrt(-scores)\n", "\n", "# Display 
accuracy\n", "print('RMSE:', np.round(rmse, 3))\n", "\n", "# Display mean score\n", "print('RMSE mean: %0.3f' % (rmse.mean()))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "n0pWxCpTPS1I", "outputId": "2af613d0-0a89-49e1-ef39-24156a9e18f9" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "RMSE: [54.744 56.125 59.595 56.908 56.583]\n", "RMSE mean: 56.791\n" ] } ] }, { "cell_type": "code", "source": [ "catb.fit(X,y)\n", "catb.get_all_params()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RtQ7lRh9Q9v1", "outputId": "29888ec1-eb04-48b3-840e-b875f7f2cc38" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'auto_class_weights': 'None',\n", " 'bayesian_matrix_reg': 0.10000000149011612,\n", " 'best_model_min_trees': 1,\n", " 'boost_from_average': True,\n", " 'boosting_type': 'Plain',\n", " 'bootstrap_type': 'MVS',\n", " 'border_count': 254,\n", " 'classes_count': 0,\n", " 'depth': 6,\n", " 'eval_metric': 'RMSE',\n", " 'feature_border_type': 'GreedyLogSum',\n", " 'force_unit_auto_pair_weights': False,\n", " 'grow_policy': 'SymmetricTree',\n", " 'iterations': 100,\n", " 'l2_leaf_reg': 3,\n", " 'leaf_estimation_backtracking': 'AnyImprovement',\n", " 'leaf_estimation_iterations': 1,\n", " 'leaf_estimation_method': 'Newton',\n", " 'learning_rate': 0.10000000149011612,\n", " 'loss_function': 'RMSE',\n", " 'max_leaves': 64,\n", " 'min_data_in_leaf': 1,\n", " 'model_shrink_mode': 'Constant',\n", " 'model_shrink_rate': 0,\n", " 'model_size_reg': 0.5,\n", " 'nan_mode': 'Min',\n", " 'penalties_coefficient': 1,\n", " 'pool_metainfo_options': {'tags': {}},\n", " 'posterior_sampling': False,\n", " 'random_seed': 0,\n", " 'random_strength': 1,\n", " 'rsm': 1,\n", " 'sampling_frequency': 'PerTree',\n", " 'score_function': 'Cosine',\n", " 'sparse_features_conflict_fraction': 0,\n", " 'subsample': 0.800000011920929,\n", " 
'task_type': 'CPU',\n", " 'use_best_model': False}" ] }, "metadata": {}, "execution_count": 47 } ] }, { "cell_type": "markdown", "source": [ "### Speed" ], "metadata": { "id": "tIPXVGF0RlOn" } }, { "cell_type": "code", "source": [ "df = pd.read_csv('exoplanets.csv')\n", "# Split data into X and y\n", "X = df.iloc[:,1:]\n", "y = df.iloc[:,0]\n", "\n", "# Split data into train and test sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" ], "metadata": { "id": "wInbb9ASREGF" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "start = time.time()\n", "\n", "# Instantiate the CatBoostClassifier, ca_reg\n", "ca_reg = CatBoostClassifier(n_estimators=100, max_depth=2, random_state=42, logging_level = 'Silent')\n", "\n", "# Fit ca_reg to training set\n", "ca_reg.fit(X_train, y_train)\n", "\n", "# Predict labels of test set, y_pred\n", "y_pred = ca_reg.predict(X_test)\n", "\n", "score = accuracy_score(y_pred, y_test)\n", "\n", "print('Score: ' + str(score))\n", "\n", "end = time.time()\n", "elapsed = end - start\n", "\n", "print('Run Time: ' + str(elapsed) + ' seconds')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sUqahaIQRrvQ", "outputId": "01bc5fa3-967b-429e-dee9-d217ab87aaaa" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Score: 0.9913522012578616\n", "Run Time: 15.904590129852295 seconds\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Hyperparameters" ], "metadata": { "id": "ELzRE-rLSEgm" } }, { "cell_type": "markdown", "source": [ "You can check more details at https://catboost.ai/en/docs/references/training-parameters/. 
There are also tutorials about hyperparameter tuning: https://github.com/catboost/tutorials/blob/master/hyperparameters_tuning/hyperparameters_tuning_using_optuna_and_hyperopt.ipynb" ], "metadata": { "id": "QPfUwGMySJW-" } }, { "cell_type": "markdown", "source": [ "### Categorical variables and missing values" ], "metadata": { "id": "Eh9WywLIS2L2" } }, { "cell_type": "markdown", "source": [ "One of the differences between CatBoost and other gradient boosting libraries is its advanced processing of the categorical features (in fact \"Cat\" in the package name stands for \"CATegorical\").\n", "\n", "CatBoost deals with the categorical data quite well out-of-the-box. However, it also has a huge number of training parameters, which provide fine control over the categorical features preprocessing." ], "metadata": { "id": "2ms2jQBMS7Pm" } }, { "cell_type": "markdown", "source": [ "The number of parameters related to categorical features processing in CatBoost is overwhelming. Here is what is, hopefully, the full list:\n", "\n", "* `one_hot_max_size` (int) - use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. No complex encoding is performed for such features. \n", "\n", "* `model_size_reg` (float from 0 to inf) - The model size regularization coefficient. The larger the value, the smaller the model size. This regularization is needed only for models with categorical features (other models are small). Models with categorical features might weigh tens of gigabytes or more if categorical features have a lot of values. If the value of the regularizer differs from zero, then the usage of categorical features or feature combinations with a lot of values has a penalty, so fewer of them are used in the resulting model. Default value is 0.5\n", "\n", "* `max_ctr_complexity` - The maximum number of features that can be combined. 
Each resulting combination consists of one or more categorical features and can optionally contain binary features in the following form: “numeric feature > value”. For regression task on CPU the default value is 4.\n", "\n", "* `has_time` (bool) - if true, the first step of categorical features processing, permutation, is not performed. Useful when the objects in your dataset are ordered by time. For our dataset, we don't need it. Default value is False\n", "\n", "* `simple_ctr` - Quantization settings for simple categorical features.\n", "\n", "* `combinations_ctr` - Quantization settings for combinations of categorical features.\n", "\n", "* `per_feature_ctr` - Per-feature quantization settings for categorical features.\n", "\n", "* `counter_calc_method` - determines whether to use the validation dataset (provided through the `eval_set` parameter of the `fit` method) to estimate category frequencies with `Counter`. By default, it is `Full` and the objects from the validation dataset are used; pass the `SkipTest` value to ignore the objects from the validation set.\n", "\n", "* `ctr_target_border_count` - The maximum number of borders to use in target quantization for categorical features that need it. Default for regression task is 1.\n", "\n", "* `ctr_leaf_count_limit` - The maximum number of leaves with categorical features. Default value is None i.e. no limit.\n", "\n", "* `store_all_simple_ctr` - If the previous parameter `ctr_leaf_count_limit` is set, at some point the gradient boosting tree can no longer make splits by categorical features. With the default value `False`, the limitation applies both to the original categorical features and to the features that CatBoost creates by combining different features. If this parameter is set to `True`, only the number of splits made on combination features is limited.\n", "\n", "The three parameters `simple_ctr`, `combinations_ctr`, and `per_feature_ctr` are complex parameters that control the second and the third steps of categorical features processing." 
], "metadata": { "id": "o6zMME3hZvbw" } }, { "cell_type": "code", "source": [ "# Select target\n", "data = pd.read_csv('melb_data.csv')\n", "y = data.Price\n", "\n", "# To keep things simple, we'll split the columns into numerical and categorical features\n", "melb_predictors = data.drop(['Price', 'Date', 'Address'], axis=1)\n", "cat_col = melb_predictors.select_dtypes(exclude=['int64','float64'])\n", "\n", "# Divide data into training and validation subsets\n", "X, X_v, y_train, y_valid = train_test_split(melb_predictors, y, train_size=0.8, test_size=0.2, random_state=0)\n", "X_train = X.select_dtypes(exclude=['object'])\n", "X_valid = X_v.select_dtypes(exclude=['object'])\n", "X_train_cat = X.select_dtypes(exclude=['int64','float64'])\n", "X_valid_cat = X_v.select_dtypes(exclude=['int64','float64'])" ], "metadata": { "id": "RjqW39jGSEAG" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "categorical_features_names = list(X_train_cat.columns)\n", "\n", "for col in categorical_features_names:\n", " X[col] = X.loc[:,col].fillna(value='nan')\n", " X_v[col] = X_v.loc[:,col].fillna(value='nan')\n", " X[col] = X[col].astype('category')\n", " X_v[col] = X_v[col].astype('category')" ], "metadata": { "id": "WJHEOWfJUTf4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names)\n", "catbr.fit(X, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wlDd6-DsR2_4", "outputId": "c3ed784d-3c64-45e0-f238-ce87e6dd38eb" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 170 } ] }, { "cell_type": "code", "source": [ "preds = catbr.predict(X_v)" ], "metadata": { "id": "U0d4xfjRVhy6" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ 
"mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-x8Pex8GVjZT", "outputId": "0d1494ea-bb33-4049-e9d4-a3a2ff071259" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "164721.46472522244" ] }, "metadata": {}, "execution_count": 111 } ] }, { "cell_type": "markdown", "source": [ "The first thing we try is to make CatBoost use one-hot encoding for all our categorical features. The documentation says, that for the features for which one-hot encoding is used no other encodings are computed." ], "metadata": { "id": "jBmQWDoLYI6g" } }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, one_hot_max_size=500)\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4Zh03QOFXndQ", "outputId": "09002728-77a8-4cbf-a533-a113efa028b4" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "163165.1377008556" ] }, "metadata": {}, "execution_count": 112 } ] }, { "cell_type": "markdown", "source": [ "Let us try to set model size regularization coefficient to 0 - thus we allow our model to use as many categorical features and its combinations as it wants." 
], "metadata": { "id": "N9S8KIKCa_ev" } }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, model_size_reg=0)\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zF2EsTaUZct7", "outputId": "546c2025-155a-443b-c088-6b673e173312" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "165007.46639148306" ] }, "metadata": {}, "execution_count": 113 } ] }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, model_size_reg=1)\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Evt8PjPXav1V", "outputId": "8110d5ee-b0a3-4ae8-aa13-050e6c13bc36" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "160827.54657606702" ] }, "metadata": {}, "execution_count": 114 } ] }, { "cell_type": "markdown", "source": [ "Note that any combination of several categorical features could be considered as a new feature; `max_ctr_complexity` sets the maximum number of features that can be combined. Although it is not mentioned in the documentation, the value of `max_ctr_complexity` has to be smaller than 15." 
], "metadata": { "id": "gMYfIC-XbIkr" } }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, max_ctr_complexity=6)\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Hn4vYk5Ja4vv", "outputId": "d9780de7-cc42-4787-d583-b32e0d44b403" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "163529.3517553931" ] }, "metadata": {}, "execution_count": 115 } ] }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, max_ctr_complexity=0)\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gDpLpL_ybXjn", "outputId": "b7a59525-0e3b-4b91-a912-411187943ba7" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "164721.46472522244" ] }, "metadata": {}, "execution_count": 116 } ] }, { "cell_type": "markdown", "source": [ "Counter method is very similar to the traditional Frequency Encoding" ], "metadata": { "id": "ygXiBf7Hcdpg" } }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, simple_ctr='Counter', combinations_ctr='Counter')\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Wq3m4Bo4bv9F", "outputId": "8c667b12-5b9b-4f0a-8ea0-7bdb330842ce" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { 
"text/plain": [ "164993.47954656056" ] }, "metadata": {}, "execution_count": 118 } ] }, { "cell_type": "markdown", "source": [ "Now we proceed to the settings of the encodings methods that require target quantization. The first choice is `Borders` vs. Buckets" ], "metadata": { "id": "5O4sYVCFcztY" } }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, simple_ctr='Borders', combinations_ctr='Borders')\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HiUGNJBzcj4T", "outputId": "02ce18b2-9d1d-4e8b-aa41-f9a4c3db0133" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "166920.94141120723" ] }, "metadata": {}, "execution_count": 119 } ] }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, simple_ctr='Buckets', combinations_ctr='Buckets')\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fKZCjPIcc8pe", "outputId": "888d596e-0853-422c-a75d-86273c49ae25" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "168168.9386958123" ] }, "metadata": {}, "execution_count": 120 } ] }, { "cell_type": "markdown", "source": [ "It is quite common to use several encodings for a categorical feature. For instance, CatBoost creates 4 different encodings for each categorical feature by default. 
Specifically, it uses several encoding techniques to encode each categorical feature:\n", "\n", "* First, it uses the `Borders` method with one target border `TargetBorderCount=1` (in our example for each categorical feature we just want to see if it makes the property more expensive). The obtained float encodings are further discretized into `CtrBorderCount=15` different values. Three values of the `Prior` parameter are used to create three different encodings: `Prior=0/1:Prior=0.5/1:Prior=1/1`\n", "\n", "* Also, for each categorical feature, we create an encoding with the `Counter` method. The number of categorical encoding value borders `CtrBorderCount` is also equal to 15, and only one value of `Prior=0/1` is used." ], "metadata": { "id": "Rt8jFH_IdNL3" } }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names, ctr_target_border_count=10)\n", "catbr.fit(X, y_train)\n", "preds = catbr.predict(X_v)\n", "mean_absolute_error(y_valid, preds)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1T-Q0uVTc_7G", "outputId": "a066252c-eb26-43fc-f125-6bb7b2bcd943" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "166281.68070428775" ] }, "metadata": {}, "execution_count": 171 } ] }, { "cell_type": "code", "source": [ "catbr = CatBoostRegressor(n_estimators=100, random_state=42, thread_count=-1, logging_level = 'Silent', cat_features=categorical_features_names)\n", "catbr.fit(X, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SHRj34ZEdxzw", "outputId": "a1c40924-bb0e-4fc1-c580-a8d26afa60ee" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 172 } ] }, { "cell_type": "markdown", "source": [ "### SHAP values" ], "metadata": { "id": "JbvYB_SrkHqJ" } }, { 
"cell_type": "code", "source": [ "explainer = shap.Explainer(catbr)\n", "shap_values = explainer(X)" ], "metadata": { "id": "NNOn6R-dfQ5G" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "shap.summary_plot(shap_values, X)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 542 }, "id": "HymD7ehvfYav", "outputId": "4308601c-da6f-444a-e77e-b445ee2ba06e" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "" ], "metadata": { "id": "hZX-5gX4f8o2" }, "execution_count": null, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" }, "nav_menu": {}, "toc": { "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 6, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false }, "colab": { "name": "14_ensemble.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true } }, "nbformat": 4, "nbformat_minor": 0 }